From abb3ddbd7d69be79c07d5e9a826cef8b8b07f1ac Mon Sep 17 00:00:00 2001
From: Saeid Alizadeh <saeidalz96@gmail.com>
Date: Sun, 8 Mar 2026 10:46:21 -0600
Subject: [PATCH] scripts

---
 .gitignore                     |   3 +-
 Dockerfile                     |   1 +
 TASKS.md                       | 175 -----------------
 docs/cold-start-analysis.md    | 138 +++++++++++++
 scripts/benchmark_coldstart.sh | 288 ++++++++++++++++++++++++++++
 scripts/benchmark_inference.sh | 341 +++++++++++++++++++++++++++++++++
 scripts/test_endpoint.sh       |   2 +-
 7 files changed, 771 insertions(+), 177 deletions(-)
 delete mode 100644 TASKS.md
 create mode 100644 docs/cold-start-analysis.md
 create mode 100755 scripts/benchmark_coldstart.sh
 create mode 100755 scripts/benchmark_inference.sh
diff --git a/.gitignore b/.gitignore
index 2eea525..c8db7c0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1 +1,2 @@
-.env
\ No newline at end of file
+.env
+results/
\ No newline at end of file
diff --git a/Dockerfile b/Dockerfile
index ba1bf8a..2ab89e3 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,3 +1,4 @@
+# TODO: this vLLM-based image might be slow; a llama.cpp-based image might be faster.
 FROM runpod/worker-v1-vllm:v2.13.1
 
 ENV MODEL_NAME="ibm-granite/granite-docling-258M"
diff --git a/TASKS.md b/TASKS.md
deleted file mode 100644
index e136356..0000000
--- a/TASKS.md
+++ /dev/null
@@ -1,175 +0,0 @@
-# Task: RunPod Serverless VLM Endpoint
-
-## Goal
-
-Build a Docker container that serves the `granite-docling-258M` vision model as an
-OpenAI-compatible HTTP API, then deploy it on RunPod Serverless so it auto-scales
-to zero when idle.
-
-## Context
-
-Our backend worker (`ks-backend`) already knows how to talk to this model. It sends
-HTTP requests to two endpoints:
-
-1. `GET /v1/models` — "what models are loaded?" (health check)
-2. `POST /v1/chat/completions` — "here's a page image, extract the content"
-
-Currently this runs on a Vast.ai GPU 24/7 (~$650/month). We want RunPod Serverless
-so we only pay when requests come in.
-
-### What the backend expects
-
-The backend sends these env vars to configure the connection:
-
-```
-VLM_ENDPOINT=http://<host>/v1/chat/completions   # full URL to chat completions
-VLM_MODEL=granite-docling-258M                    # model name returned by /v1/models
-VLM_API_KEY=<optional bearer token>               # sent as Authorization: Bearer <key>
-```
-
-The backend calls `GET /v1/models`, checks that the configured model name appears in
-the response list (case-insensitive), and if so, sends chat completion requests with:
-
-```json
-{
-  "model": "<VLM_MODEL>",
-  "messages": [{"role": "user", "content": "Convert this page to docling."}],
-  "temperature": 0.0,
-  "skip_special_tokens": false
-}
-```
-
-Images are sent inline in the messages as base64 data URIs (standard OpenAI vision
-format). The response must be standard OpenAI chat completion format.
-
-### VLM pipeline settings (from ks-backend ingestion_config.yaml)
-
-- `vlm_concurrency: 2` — up to 2 parallel requests per document
-- `vlm_temperature: 0.0`
-- `vlm_timeout: 300` — 5 minute timeout per request
-- `document_timeout: 7200` — 2 hour max for full document conversion
-
----
-
-## Tasks
-
-### 1. Find the right model on HuggingFace
-
-- The model is IBM's `granite-docling-258M` (a small vision-language model for
-  document understanding)
-- HuggingFace ID: look for `ds4sd/docling-granite-258M-preview` or similar under
-  the `ibm-granite` or `ds4sd` organizations
-- We need either:
-  - **The original safetensors** (if using vLLM to serve it), OR
-  - **A GGUF conversion** (if using llama.cpp to serve it)
-- The model is only 258M parameters — any modern GPU can run it
-
-### 2. Choose a serving approach
-
-Pick ONE of these. Both produce the same OpenAI-compatible API.
-
-#### Option A: RunPod's vLLM worker (recommended — least work)
-
-- RunPod has a pre-built Docker image: `runpod/worker-vllm`
-- It loads a model from HuggingFace and serves it with OpenAI-compatible endpoints
-- You configure it via environment variables (model name, HF token, etc.)
-- RunPod exposes it at: `https://api.runpod.ai/v2/{endpoint_id}/openai/v1/...`
-- Docs: https://github.com/runpod-workers/worker-vllm
-- **Verify that vLLM supports this specific model** (check vLLM's supported models
-  list for vision models)
-
-#### Option B: Custom Docker image with llama.cpp
-
-- Write a `Dockerfile` that:
-  1. Starts from a CUDA base image (e.g. `nvidia/cuda:12.4.0-runtime-ubuntu22.04`)
-  2. Installs llama.cpp (build from source or use a release binary)
-  3. Downloads the GGUF model file at build time (bake it into the image) or at
-     container startup
-  4. Runs `llama-server` which natively exposes `/v1/chat/completions` and
-     `/v1/models`
-- The startup command would be something like:
-  ```
-  llama-server --model /models/granite-docling-258M.gguf --port 8000 --host 0.0.0.0
-  ```
-- Push the image to Docker Hub (or RunPod's container registry)
-
-### 3. Deploy on RunPod Serverless
-
-1. Go to RunPod → Serverless → New Endpoint
-2. If using Option A (vLLM worker): select the vLLM template, configure the model
-3. If using Option B (custom image): point it to your Docker Hub image
-4. Configure:
-   - **GPU type**: cheapest that fits (RTX 4000/3090/4090 — model is tiny)
-   - **Active workers**: `0` (this is the whole point — zero cost when idle)
-   - **Max workers**: `1` (start with 1, increase later if needed)
-   - **Idle timeout**: `300` seconds (5 minutes — GPU shuts down after this)
-   - **Execution timeout**: `600` seconds (long enough for big PDFs)
-
-### 4. Verify the endpoint works
-
-Once deployed, test from the command line:
-
-```bash
-# Check /v1/models — should list the model name
-curl https://api.runpod.ai/v2/{endpoint_id}/openai/v1/models \
-  -H "Authorization: Bearer $RUNPOD_API_KEY"
-
-# Send a test chat completion (text-only, no image, just to verify the format)
-curl https://api.runpod.ai/v2/{endpoint_id}/openai/v1/chat/completions \
-  -H "Authorization: Bearer $RUNPOD_API_KEY" \
-  -H "Content-Type: application/json" \
-  -d '{
-    "model": "granite-docling-258M",
-    "messages": [{"role": "user", "content": "Hello"}],
-    "temperature": 0.0
-  }'
-```
-
-Both should return standard OpenAI-format JSON responses.
-
-### 5. Measure latency
-
-- **Cold start**: time the first request after the endpoint has been idle (GPU
-  spins up from zero). Run the curl above with `time` in front.
-- **Hot start**: time a second request immediately after. This is the steady-state
-  performance.
-- Record both numbers.
-
-### 6. Wire it into ks-backend for testing
-
-Update `ks-backend/.env.dev` with:
-
-```
-VLM_ENDPOINT=https://api.runpod.ai/v2/{endpoint_id}/openai/v1/chat/completions
-VLM_MODEL=granite-docling-258M
-VLM_API_KEY=<your runpod api key>
-```
-
-Then:
-1. Run `make dev-api` and `make dev-worker`
-2. Watch worker logs — look for `vlm_model_available` (success) or
-   `vlm_endpoint_unreachable` / `vlm_model_not_found` (failure)
-3. Upload a test PDF through the app
-4. Confirm VLM-powered ingestion completes
-
-### Potential issues
-
-- **Cold start vs health check timeout**: The backend's health check (`GET /v1/models`)
-  has a 10-second timeout. If cold start takes longer, the check will fail and the
-  worker falls back to the non-VLM pipeline silently. Fix: pre-warm the endpoint with
-  a manual curl before testing, or increase the timeout in
-  `ks-backend/src/worker/utils/docling.py:check_vlm_available()`.
-- **Model name mismatch**: The model name in `/v1/models` response must match
-  `VLM_MODEL` (case-insensitive). Check what name the server actually reports.
-- **RunPod URL format**: Make sure the URL path is correct. RunPod's OpenAI-compatible
-  proxy lives under `/openai/v1/...` not just `/v1/...`. The full URL would be
-  `https://api.runpod.ai/v2/{endpoint_id}/openai/v1/chat/completions`.
-
-## Definition of Done
-
-- Docker image is built and pushed (or vLLM template is configured)
-- RunPod serverless endpoint is running with active workers = 0
-- `GET /v1/models` returns the model name
-- `POST /v1/chat/completions` returns a valid response
-- Cold start and hot start times are measured and recorded
-- OR: document why it doesn't work and what blocked it
diff --git a/docs/cold-start-analysis.md b/docs/cold-start-analysis.md
new file mode 100644
index 0000000..59a8846
--- /dev/null
+++ b/docs/cold-start-analysis.md
@@ -0,0 +1,138 @@
+# Cold Start Analysis: RunPod Serverless GPU
+
+## Context
+
+We serve IBM's [granite-docling-258M](https://huggingface.co/ibm-granite/granite-docling-258M) vision-language model on a RunPod serverless endpoint for document ingestion. Our workloads are bursty — documents arrive in spikes, with idle periods in between. This analysis determines whether we need a warm pool of GPU instances or can rely on scale-to-zero.
+
+## Benchmark Results
+
+All benchmarks ran against endpoint `docling-vlm-2` using `scripts/benchmark_coldstart.sh` and simple text prompts (`max_tokens: 16`). Image-based inference benchmarks (`scripts/benchmark_inference.sh`) should be run separately for realistic per-document latency.
+
+### True Cold Start (first-ever boot, no FlashBoot cache)
+
+| Metric | Value |
+|---|---|
+| Cold start | **~80s** |
+| First inference | 0.75s |
+
+This was observed on the very first request after deploying the endpoint, before RunPod had any cached state.
+
+### FlashBoot Cold Start (0 running workers, $0.00/s billing, but recently used)
+
+| Metric | Value |
+|---|---|
+| Cold start | **~1.4s** |
+| First inference | 0.67s |
+
+Even with 0 running workers and no active billing, RunPod's FlashBoot revived a cached worker in ~1.4s. This was reproducible across multiple runs.
+
+### Warm Inference (worker already running)
+
+| Metric | Value |
+|---|---|
+| Avg latency (text, 16 tokens) | 0.71s |
+| P50 latency | 0.69s |
+| Min / Max | 0.65s / 0.80s |
+
+### Burst Test (5 concurrent text requests)
+
+| Metric | Value |
+|---|---|
+| Wall time | ~3.9s |
+| Avg per-request | 2.2s |
+| P50 | 2.6s |
+| Success rate | 5/5 |
+
+Higher per-request latency during bursts is expected — the endpoint has `MAX_CONCURRENCY=2`, so requests queue behind each other on a single worker.
+
+> **Note:** These numbers are for trivial text prompts. Real document image inference will be significantly slower due to image encoding, vision preprocessing, and longer output generation (500-2000+ tokens). Run `scripts/benchmark_inference.sh` for realistic numbers.
+
+## RunPod Worker Types
+
+| Type | Behavior | Billing | Cold Start |
+|---|---|---|---|
+| **Active Workers** | Always on, never shut down | Continuous (40% discount) | None |
+| **Flex Workers** | Spin up on demand, shut down after idle timeout | Only while running | FlashBoot or full cold start |
+
+- **Active Workers** = minimum workers always running. Set via endpoint config.
+- **Max Workers** = ceiling for autoscaling. Flex workers spin up to fill the gap between active and max.
+- **Idle Timeout** = how long a flex worker stays alive after finishing its last job (default: 5s). Worker is fully shut down after this expires.
+
+Source: [RunPod Endpoint Configurations](https://docs.runpod.io/serverless/endpoints/endpoint-configurations)
+
+## FlashBoot
+
+FlashBoot is RunPod's container caching system that reduces cold starts by retaining worker state after shutdown. It's free and enabled by default.
+
+### Key characteristics
+
+- **Probabilistic, not time-based.** There is no fixed TTL or cache duration.
+- **Decay curve:** Requesting a worker immediately after shutdown gives the highest chance of a FlashBoot hit. The probability decreases over time until eventually you get a full cold start.
+- **No guaranteed SLA.** RunPod staff confirmed: *"there isn't a fixed timeframe — it is based on the requests you have and their platform available resources."*
+- **Traffic-dependent.** Endpoints with consistent traffic get better FlashBoot hit rates. After extended idle periods, FlashBoot *"is disabled as the instance goes to a deeper sleep."*
+- **Image popularity matters.** Container images used by more RunPod customers are cached more aggressively across the platform.
+
+### What we observed
+
+| Scenario | Cold start time |
+|---|---|
+| First-ever request (no cache) | ~80s |
+| Request after ~20 min idle | ~1.4s (FlashBoot hit) |
+| After hours/days idle (not measured) | Likely ~80s (FlashBoot cache expired) |
+
+### Sources
+
+- [Introducing FlashBoot: 1-Second Serverless Cold-Start (RunPod Blog)](https://www.runpod.io/blog/introducing-flashboot-serverless-cold-start)
+- [Keeping Flashboot active? (RunPod Discord)](https://www.answeroverflow.com/m/1293671895564161116)
+- [Flashboot not working after a while (RunPod Discord)](https://www.answeroverflow.com/m/1340825479820611624)
+- [Serverless or Regular Pod? How good is Flashboot? (RunPod Discord)](https://www.answeroverflow.com/m/1292890615922561076)
+- [Very slow cold starts with FlashBoot (GitHub Issue)](https://github.com/runpod-workers/worker-vllm/issues/111)
+
+## Recommendations
+
+### For bursty workloads with predictable patterns (e.g. business-hours ingestion)
+
+**Set Active Workers = 0, Idle Timeout = 300s.** Workers stay warm between closely-spaced bursts and shut down during long gaps. FlashBoot handles the re-warm if the gap is short enough.
+
+Optionally, send a pre-warm request (e.g. `GET /v1/models`) before kicking off a batch job to absorb the cold start outside the critical path.
+
+### For unpredictable bursts with long idle gaps (hours/days)
+
+**Set Active Workers = 1.** One worker is always warm and handles the first request instantly. Flex workers scale up for the rest of the burst. This costs more (continuous billing at 40% discount) but guarantees no cold start penalty.
+
+### For cost-sensitive, latency-tolerant workloads
+
+**Set Active Workers = 0, rely on FlashBoot.** Accept that the first request after a long gap may take ~80s. Subsequent requests in the same burst will be fast. This is the cheapest option.
+
+### Cost comparison (rough estimate)
+
+Assuming an RTX A4500 at ~$0.29/hr on RunPod serverless:
+
+| Strategy | Monthly idle cost | Cold start risk |
+|---|---|---|
+| Active Workers = 0 | $0 | 1.4s–80s (unpredictable) |
+| Active Workers = 1 | ~$210/mo | None |
+| Idle Timeout = 300s | Depends on traffic | None within 5 min of last request |
+
+Compare to the previous always-on Vast.ai GPU at **~$650/mo**.
+
+## Scripts
+
+- `scripts/benchmark_coldstart.sh` — Measures cold start, warm inference, and burst latency with simple text prompts.
+- `scripts/benchmark_inference.sh` — Measures realistic inference latency using actual document page images.
+
+### Usage
+
+```bash
+# True cold start: scale to 0 in RunPod dashboard, wait for workers to fully terminate
+./scripts/benchmark_coldstart.sh
+
+# Realistic document inference (run after endpoint is warm)
+./scripts/benchmark_inference.sh
+
+# Custom parameters
+WARM_REQUESTS=10 BURST_SIZE=10 ./scripts/benchmark_coldstart.sh
+SAMPLE_IMAGE=/path/to/your/doc.png MAX_TOKENS=4096 ./scripts/benchmark_inference.sh
+```
+
+Results are saved as timestamped JSON files in `results/` (gitignored).
diff --git a/scripts/benchmark_coldstart.sh b/scripts/benchmark_coldstart.sh
new file mode 100755
index 0000000..bc7a676
--- /dev/null
+++ b/scripts/benchmark_coldstart.sh
@@ -0,0 +1,288 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# ── Cold Start & Latency Benchmark for RunPod Serverless GPU ─────────────────
+#
+# Measures:
+#   1. Cold start time   — how long until the endpoint is ready (from idle)
+#   2. First inference    — first request latency (model may still be warming)
+#   3. Warm inference     — average latency over N requests on a hot worker
+#   4. Burst throughput   — N concurrent requests to simulate bursty ingestion
+#
+# Prerequisites:
+#   - RUNPOD_API_KEY and RUNPOD_ENDPOINT_ID set in .env or environment
+#   - Endpoint should be IDLE (0 active workers) for accurate cold start measurement
+#     You can scale to 0 in the RunPod dashboard before running this.
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ENV_FILE="${SCRIPT_DIR}/../.env"
+
+if [[ -f "$ENV_FILE" ]]; then
+  set -a; source "$ENV_FILE"; set +a
+fi
+
+: "${RUNPOD_API_KEY:?Set RUNPOD_API_KEY in .env or environment}"
+: "${RUNPOD_ENDPOINT_ID:?Set RUNPOD_ENDPOINT_ID in .env or environment}"
+
+BASE_URL="https://api.runpod.ai/v2/${RUNPOD_ENDPOINT_ID}/openai/v1"
+AUTH="Authorization: Bearer ${RUNPOD_API_KEY}"
+WARM_REQUESTS=${WARM_REQUESTS:-5}
+BURST_SIZE=${BURST_SIZE:-5}
+POLL_INTERVAL=5
+MAX_WAIT=300
+
+# ── Helpers ──────────────────────────────────────────────────────────────────
+ts()   { python3 -c "import time; print(time.time())"; }  # current Unix time in seconds (float), via python3
+bold() { printf "\033[1m%s\033[0m\n" "$1"; }  # print $1 in bold (ANSI SGR 1), then reset
+info() { printf "  %s\n" "$1"; }  # print $1 indented by two spaces
+ok()   { printf "\033[32m  ✓ %s\033[0m\n" "$1"; }  # print $1 in green with a check mark
+err()  { printf "\033[31m  ✗ %s\033[0m\n" "$1"; }  # print $1 in red with a cross (goes to stdout, not stderr)
+hr()   { echo "────────────────────────────────────────────────────────"; }  # print a horizontal rule
+
+RESULTS_DIR="${SCRIPT_DIR}/../results"
+mkdir -p "$RESULTS_DIR"
+RESULTS_FILE="${RESULTS_DIR}/coldstart_$(date +%Y%m%d_%H%M%S).json"
+
+cat <<BANNER
+
+$(bold "RunPod Serverless GPU — Cold Start Benchmark")
+  Endpoint:        ${RUNPOD_ENDPOINT_ID}
+  Warm requests:   ${WARM_REQUESTS}
+  Burst size:      ${BURST_SIZE}
+  Results file:    ${RESULTS_FILE}
+
+BANNER
+
+# Send one minimal text chat completion; prints "<elapsed_seconds> <http_code>" on stdout.
+do_request() {
+  local t0 t1 status raw secs
+  t0=$(ts)
+  raw=$(curl --silent --write-out "\n%{http_code}" \
+    "${BASE_URL}/chat/completions" \
+    --header "$AUTH" \
+    --header "Content-Type: application/json" \
+    --max-time 300 \
+    --data '{
+      "model": "'"${MODEL_NAME}"'",
+      "messages": [{"role": "user", "content": "Hello"}],
+      "temperature": 0.0,
+      "max_tokens": 16
+    }' 2>&1) || true
+  t1=$(ts)
+  status=$(tail -n 1 <<< "$raw")  # --write-out puts the HTTP status on the last line
+  secs=$(python3 -c "print(format(${t1} - ${t0}, '.3f'))")
+  printf '%s %s\n' "$secs" "$status"
+}
+
+# ══════════════════════════════════════════════════════════════════════════════
+# Phase 1: Cold Start — poll /v1/models until ready
+# ══════════════════════════════════════════════════════════════════════════════
+hr
+bold "Phase 1: Cold Start (polling /v1/models)"
+info "Tip: scale endpoint to 0 workers first for an accurate measurement."
+echo ""
+
+COLD_START_BEGIN=$(ts)
+SECONDS=0   # bash's built-in timer: real elapsed seconds, so time spent inside curl counts toward MAX_WAIT
+READY=false
+
+while (( SECONDS < MAX_WAIT )); do
+  CODE=$(curl -s -o /dev/null -w "%{http_code}" \
+    "${BASE_URL}/models" \
+    -H "$AUTH" \
+    --max-time 30 2>/dev/null) || CODE="000"
+
+  if [[ "$CODE" == "200" ]]; then
+    READY=true
+    break
+  fi
+
+  WAITED=$SECONDS   # wall-clock seconds since polling began (shown in the progress line)
+  printf "\r  ⏳ %3ds — HTTP %s" "$WAITED" "$CODE"
+  sleep "$POLL_INTERVAL"
+done
+echo ""
+
+COLD_START_END=$(ts)
+COLD_START_SECS=$(python3 -c "print(f'{${COLD_START_END} - ${COLD_START_BEGIN}:.1f}')")
+
+if $READY; then
+  ok "Endpoint ready in ${COLD_START_SECS}s"
+else
+  err "Endpoint not ready after ${MAX_WAIT}s — aborting"
+  exit 1
+fi
+
+# Discover model name
+MODEL_NAME=$(curl -s "${BASE_URL}/models" -H "$AUTH" \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['data'][0]['id'])" 2>/dev/null \
+  || echo "granite-docling-258M")
+info "Model: ${MODEL_NAME}"
+
+# ══════════════════════════════════════════════════════════════════════════════
+# Phase 2: First Inference (may include model warm-up overhead)
+# ══════════════════════════════════════════════════════════════════════════════
+echo ""
+hr
+bold "Phase 2: First Inference"
+
+read -r FIRST_LATENCY FIRST_CODE <<< "$(do_request)"
+
+if [[ "$FIRST_CODE" == "200" ]]; then
+  ok "First inference: ${FIRST_LATENCY}s"
+else
+  err "First inference failed: HTTP ${FIRST_CODE} (${FIRST_LATENCY}s)"
+fi
+
+# ══════════════════════════════════════════════════════════════════════════════
+# Phase 3: Warm Inference — sequential requests on a hot worker
+# ══════════════════════════════════════════════════════════════════════════════
+echo ""
+hr
+bold "Phase 3: Warm Inference (${WARM_REQUESTS} sequential requests)"
+
+WARM_TOTAL=0
+WARM_MIN=999999
+WARM_MAX=0
+WARM_FAILURES=0
+WARM_LATENCIES=""
+
+for i in $(seq 1 "$WARM_REQUESTS"); do
+  read -r LAT CODE <<< "$(do_request)"
+  if [[ "$CODE" == "200" ]]; then
+    printf "  [%d/%d] %ss\n" "$i" "$WARM_REQUESTS" "$LAT"
+    WARM_LATENCIES="${WARM_LATENCIES} ${LAT}"
+    WARM_TOTAL=$(python3 -c "print(${WARM_TOTAL} + ${LAT})")
+    WARM_MIN=$(python3 -c "print(min(${WARM_MIN}, ${LAT}))")
+    WARM_MAX=$(python3 -c "print(max(${WARM_MAX}, ${LAT}))")
+  else
+    err "[${i}/${WARM_REQUESTS}] HTTP ${CODE} (${LAT}s)"
+    WARM_FAILURES=$((WARM_FAILURES + 1))
+  fi
+done
+
+WARM_SUCCESS=$((WARM_REQUESTS - WARM_FAILURES))
+if (( WARM_SUCCESS > 0 )); then
+  WARM_AVG=$(python3 -c "print(f'{${WARM_TOTAL} / ${WARM_SUCCESS}:.3f}')")
+  WARM_P50=$(python3 -c "
+import statistics
+lats = [float(x) for x in '${WARM_LATENCIES}'.split()]
+print(f'{statistics.median(lats):.3f}')
+")
+  echo ""
+  ok "Avg: ${WARM_AVG}s | P50: ${WARM_P50}s | Min: ${WARM_MIN}s | Max: ${WARM_MAX}s"
+  if (( WARM_FAILURES > 0 )); then
+    err "${WARM_FAILURES}/${WARM_REQUESTS} requests failed"
+  fi
+else
+  WARM_AVG="N/A"; WARM_P50="N/A"; WARM_MIN="N/A"; WARM_MAX="N/A"
+  err "All warm requests failed"
+fi
+
+# ══════════════════════════════════════════════════════════════════════════════
+# Phase 4: Burst Test — concurrent requests to simulate ingestion spike
+# ══════════════════════════════════════════════════════════════════════════════
+echo ""
+hr
+bold "Phase 4: Burst Test (${BURST_SIZE} concurrent requests)"
+
+BURST_DIR=$(mktemp -d /tmp/burst_XXXX)
+BURST_START=$(ts)
+
+for i in $(seq 1 "$BURST_SIZE"); do
+  (
+    start=$(ts)
+    code=$(curl -s -o /dev/null -w "%{http_code}" \
+      "${BASE_URL}/chat/completions" \
+      -H "$AUTH" \
+      -H "Content-Type: application/json" \
+      --max-time 300 \
+      -d '{
+        "model": "'"${MODEL_NAME}"'",
+        "messages": [{"role": "user", "content": "Hello"}],
+        "temperature": 0.0,
+        "max_tokens": 16
+      }' 2>/dev/null) || code="000"
+    end=$(ts)
+    elapsed=$(python3 -c "print(f'{${end} - ${start}:.3f}')")
+    echo "${elapsed} ${code}" > "${BURST_DIR}/${i}.txt"
+  ) &
+done
+
+wait
+BURST_END=$(ts)
+BURST_WALL=$(python3 -c "print(f'{${BURST_END} - ${BURST_START}:.3f}')")
+
+BURST_LATS=""
+BURST_OK=0
+BURST_FAIL=0
+
+for f in "${BURST_DIR}"/*.txt; do
+  read -r LAT CODE < "$f"
+  if [[ "$CODE" == "200" ]]; then
+    BURST_LATS="${BURST_LATS} ${LAT}"
+    BURST_OK=$((BURST_OK + 1))
+    info "[ok] ${LAT}s"
+  else
+    BURST_FAIL=$((BURST_FAIL + 1))
+    err "[HTTP ${CODE}] ${LAT}s"
+  fi
+done
+
+rm -rf "$BURST_DIR"
+
+echo ""
+ok "Wall time: ${BURST_WALL}s | ${BURST_OK}/${BURST_SIZE} succeeded"
+
+if (( BURST_OK > 0 )); then
+  BURST_STATS=$(python3 -c "
+import statistics
+lats = [float(x) for x in '${BURST_LATS}'.split()]
+print(f'Avg: {statistics.mean(lats):.3f}s | P50: {statistics.median(lats):.3f}s | Max: {max(lats):.3f}s')
+")
+  ok "${BURST_STATS}"
+fi
+
+# ══════════════════════════════════════════════════════════════════════════════
+# Summary
+# ══════════════════════════════════════════════════════════════════════════════
+echo ""
+hr
+bold "Summary"
+echo ""
+printf "  %-25s %s\n" "Cold start:" "${COLD_START_SECS}s"
+printf "  %-25s %s\n" "First inference:" "${FIRST_LATENCY}s"
+printf "  %-25s %s\n" "Warm avg (${WARM_SUCCESS} reqs):" "${WARM_AVG}s"
+printf "  %-25s %s\n" "Warm P50:" "${WARM_P50}s"
+printf "  %-25s %s\n" "Burst wall (${BURST_SIZE} reqs):" "${BURST_WALL}s"
+echo ""
+
+# ── Write JSON results ───────────────────────────────────────────────────────
+python3 -c "
+import json, datetime
+results = {
+    'timestamp': datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() + 'Z',  # same format as before; utcnow() is deprecated since Python 3.12
+    'endpoint_id': '${RUNPOD_ENDPOINT_ID}',
+    'model': '${MODEL_NAME}',
+    'cold_start_secs': ${COLD_START_SECS},
+    'first_inference_secs': ${FIRST_LATENCY},
+    'warm': {
+        'requests': ${WARM_SUCCESS},
+        'avg_secs': ${WARM_AVG} if '${WARM_AVG}' != 'N/A' else None,  # when the value is N/A the left operand is never evaluated, so None is stored
+        'p50_secs': ${WARM_P50} if '${WARM_P50}' != 'N/A' else None,
+        'min_secs': ${WARM_MIN} if '${WARM_MIN}' != 'N/A' else None,
+        'max_secs': ${WARM_MAX} if '${WARM_MAX}' != 'N/A' else None,
+    },
+    'burst': {
+        'concurrency': ${BURST_SIZE},
+        'wall_secs': ${BURST_WALL},
+        'succeeded': ${BURST_OK},
+        'failed': ${BURST_FAIL},
+    }
+}
+with open('${RESULTS_FILE}', 'w') as f:
+    json.dump(results, f, indent=2)
+"
+
+ok "Results written to ${RESULTS_FILE}"
+echo ""
diff --git a/scripts/benchmark_inference.sh b/scripts/benchmark_inference.sh
new file mode 100755
index 0000000..e525903
--- /dev/null
+++ b/scripts/benchmark_inference.sh
@@ -0,0 +1,341 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# ── Inference Latency Benchmark (with real document image) ───────────────────
+#
+# Measures realistic inference times by sending an actual document page image
+# to the endpoint, simulating real ingestion workloads.
+#
+# Measures:
+#   1. Image download + base64 encoding overhead
+#   2. Single document inference (full output)
+#   3. Warm sequential inference (N requests)
+#   4. Burst concurrent inference (N parallel requests)
+#
+# Prerequisites:
+#   - RUNPOD_API_KEY and RUNPOD_ENDPOINT_ID in .env or environment
+#   - Endpoint should already be WARM (run benchmark_coldstart.sh first)
+#
+# Usage:
+#   ./scripts/benchmark_inference.sh                          # defaults
+#   WARM_REQUESTS=10 BURST_SIZE=10 ./scripts/benchmark_inference.sh
+#   SAMPLE_IMAGE=/path/to/your/doc.png ./scripts/benchmark_inference.sh
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+ENV_FILE="${SCRIPT_DIR}/../.env"
+
+if [[ -f "$ENV_FILE" ]]; then
+  set -a; source "$ENV_FILE"; set +a
+fi
+
+: "${RUNPOD_API_KEY:?Set RUNPOD_API_KEY in .env or environment}"
+: "${RUNPOD_ENDPOINT_ID:?Set RUNPOD_ENDPOINT_ID in .env or environment}"
+
+BASE_URL="https://api.runpod.ai/v2/${RUNPOD_ENDPOINT_ID}/openai/v1"
+AUTH="Authorization: Bearer ${RUNPOD_API_KEY}"
+WARM_REQUESTS=${WARM_REQUESTS:-3}
+BURST_SIZE=${BURST_SIZE:-3}
+MAX_TOKENS=${MAX_TOKENS:-4096}
+
+# ── Helpers ──────────────────────────────────────────────────────────────────
+ts()   { python3 -c "import time; print(time.time())"; }  # current Unix time in seconds (float), via python3
+bold() { printf "\033[1m%s\033[0m\n" "$1"; }  # print $1 in bold (ANSI SGR 1), then reset
+info() { printf "  %s\n" "$1"; }  # print $1 indented by two spaces
+ok()   { printf "\033[32m  ✓ %s\033[0m\n" "$1"; }  # print $1 in green with a check mark
+err()  { printf "\033[31m  ✗ %s\033[0m\n" "$1"; }  # print $1 in red with a cross (goes to stdout, not stderr)
+hr()   { echo "────────────────────────────────────────────────────────"; }  # print a horizontal rule
+
+RESULTS_DIR="${SCRIPT_DIR}/../results"
+mkdir -p "$RESULTS_DIR"
+RESULTS_FILE="${RESULTS_DIR}/inference_$(date +%Y%m%d_%H%M%S).json"
+
+# ── Prepare sample document image ────────────────────────────────────────────
+hr
+bold "Preparing document image"
+
+SAMPLE_IMG_URL="https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/examples/input/wikipedia_example.png"
+
+if [[ -n "${SAMPLE_IMAGE:-}" && -f "${SAMPLE_IMAGE}" ]]; then
+  info "Using provided image: ${SAMPLE_IMAGE}"
+  SAMPLE_B64=$(base64 < "$SAMPLE_IMAGE" | tr -d '\n')  # GNU base64 wraps at 76 columns; the data URI must be a single line
+  IMG_SIZE_KB=$(( $(wc -c < "$SAMPLE_IMAGE") / 1024 ))
+else
+  info "Downloading sample from HuggingFace..."
+  TMP_IMG=$(mktemp /tmp/sample_page_XXXX.png)
+  if curl -fsL -o "$TMP_IMG" "$SAMPLE_IMG_URL" && [[ -s "$TMP_IMG" ]]; then  # -f: treat HTTP errors as failure instead of saving the error page
+    SAMPLE_B64=$(base64 < "$TMP_IMG" | tr -d '\n')  # strip line wraps, same reason as above
+    IMG_SIZE_KB=$(( $(wc -c < "$TMP_IMG") / 1024 ))
+    rm -f "$TMP_IMG"
+  else
+    err "Could not download sample image. Provide one via SAMPLE_IMAGE=/path/to/doc.png"
+    rm -f "$TMP_IMG"
+    exit 1
+  fi
+fi
+
+B64_SIZE_KB=$(( ${#SAMPLE_B64} / 1024 ))
+ok "Image ready (${IMG_SIZE_KB} KB raw, ${B64_SIZE_KB} KB base64)"
+
+# ── Check endpoint is warm ──────────────────────────────────────────────────
+echo ""
+hr
+bold "Checking endpoint is warm"
+
+CODE=$(curl -s -o /dev/null -w "%{http_code}" \
+  "${BASE_URL}/models" -H "$AUTH" --max-time 15 2>/dev/null) || CODE="000"
+
+if [[ "$CODE" == "200" ]]; then
+  ok "Endpoint is ready"
+else
+  err "Endpoint returned HTTP ${CODE} — run benchmark_coldstart.sh first to warm it up"
+  exit 1
+fi
+
+MODEL_NAME=$(curl -s "${BASE_URL}/models" -H "$AUTH" \
+  | python3 -c "import sys,json; print(json.load(sys.stdin)['data'][0]['id'])" 2>/dev/null \
+  || echo "granite-docling-258M")
+info "Model: ${MODEL_NAME}"
+
+# ── Build the request payload ────────────────────────────────────────────────
+# Matches how granite-docling is used (page image in, docling markup out). Inputs reach Python via stdin/env, not the -c source text.
+PAYLOAD_FILE=$(mktemp /tmp/bench_payload_XXXX.json)
+printf '%s' "$SAMPLE_B64" | MODEL_NAME="$MODEL_NAME" MAX_TOKENS="$MAX_TOKENS" PAYLOAD_FILE="$PAYLOAD_FILE" python3 -c "
+import json, os, sys
+b64 = ''.join(sys.stdin.read().split())  # stdin avoids the per-argument size limit; split/join drops base64 line wraps
+payload = {
+    'model': os.environ['MODEL_NAME'],
+    'messages': [{
+        'role': 'user',
+        'content': [
+            {'type': 'image_url', 'image_url': {'url': 'data:image/png;base64,' + b64}},
+            {'type': 'text', 'text': 'Convert this page to docling markup.'}
+        ]
+    }],
+    'temperature': 0.0,
+    'max_tokens': int(os.environ['MAX_TOKENS'])
+}
+with open(os.environ['PAYLOAD_FILE'], 'w') as f:
+    json.dump(payload, f)
+"
+PAYLOAD_SIZE_KB=$(( $(wc -c < "$PAYLOAD_FILE") / 1024 ))
+info "Payload size: ${PAYLOAD_SIZE_KB} KB"
+
+cat <<BANNER
+
+$(bold "RunPod Serverless GPU — Inference Benchmark (Document Image)")
+  Endpoint:        ${RUNPOD_ENDPOINT_ID}
+  Model:           ${MODEL_NAME}
+  Image:           ${IMG_SIZE_KB} KB (${B64_SIZE_KB} KB base64)
+  Payload:         ${PAYLOAD_SIZE_KB} KB
+  Max tokens:      ${MAX_TOKENS}
+  Warm requests:   ${WARM_REQUESTS}
+  Burst size:      ${BURST_SIZE}
+  Results file:    ${RESULTS_FILE}
+
+BANNER
+
+# Send the prepared image payload once; prints "<elapsed_seconds> <http_code> <output_tokens>" on stdout.
+do_image_request() {
+  local t0 t1 status raw secs json_body n_tokens
+  t0=$(ts)
+  raw=$(curl --silent --write-out "\n%{http_code}" \
+    "${BASE_URL}/chat/completions" \
+    --header "$AUTH" \
+    --header "Content-Type: application/json" \
+    --max-time 600 \
+    --data @"${PAYLOAD_FILE}" 2>&1) || true
+  t1=$(ts)
+  status=$(tail -n 1 <<< "$raw")  # --write-out puts the HTTP status on the last line
+  json_body=$(sed '$d' <<< "$raw")  # everything except that status line
+  secs=$(python3 -c "print(format(${t1} - ${t0}, '.3f'))")
+  n_tokens=$(python3 -c "
+import sys, json
+try:
+    usage = json.load(sys.stdin).get('usage', {})
+    print(usage.get('completion_tokens', usage.get('total_tokens', '?')))
+except: print('?')
+" <<< "$json_body" 2>/dev/null)
+  printf '%s %s %s\n' "$secs" "$status" "$n_tokens"
+}
+
+# ══════════════════════════════════════════════════════════════════════════════
+# Phase 1: First Document Inference
+# ══════════════════════════════════════════════════════════════════════════════
+hr
+bold "Phase 1: First Document Inference"
+
+read -r FIRST_LAT FIRST_CODE FIRST_TOKENS <<< "$(do_image_request)"
+
+if [[ "$FIRST_CODE" == "200" ]]; then
+  ok "First inference: ${FIRST_LAT}s (${FIRST_TOKENS} tokens)"
+else
+  err "First inference failed: HTTP ${FIRST_CODE} (${FIRST_LAT}s)"
+  # Show error body for debugging
+  curl -s "${BASE_URL}/chat/completions" \
+    -H "$AUTH" -H "Content-Type: application/json" \
+    --max-time 60 -d @"${PAYLOAD_FILE}" 2>&1 | python3 -c "
+import sys, json
+try: print(json.dumps(json.load(sys.stdin), indent=2)[:500])
+except: print(sys.stdin.read()[:500])
+" || true
+fi
+
+# ══════════════════════════════════════════════════════════════════════════════
+# Phase 2: Warm Sequential Inference
+# ══════════════════════════════════════════════════════════════════════════════
+echo ""
+hr
+bold "Phase 2: Warm Inference (${WARM_REQUESTS} sequential document requests)"
+
+WARM_TOTAL=0
+WARM_MIN=999999  # sentinel above any expected latency; replaced by the first successful request
+WARM_MAX=0
+WARM_FAILURES=0
+WARM_LATENCIES=""  # space-separated latencies of successful requests, used for the median below
+WARM_TOKENS_TOTAL=0
+
+for i in $(seq 1 "$WARM_REQUESTS"); do
+  read -r LAT CODE TOKENS <<< "$(do_image_request)"  # fields: elapsed seconds, HTTP status, token count
+  if [[ "$CODE" == "200" ]]; then
+    printf "  [%d/%d] %ss (%s tokens)\n" "$i" "$WARM_REQUESTS" "$LAT" "$TOKENS"
+    WARM_LATENCIES="${WARM_LATENCIES} ${LAT}"
+    WARM_TOTAL=$(python3 -c "print(${WARM_TOTAL} + ${LAT})")  # latencies are fractional; bash arithmetic is integer-only
+    WARM_MIN=$(python3 -c "print(min(${WARM_MIN}, ${LAT}))")
+    WARM_MAX=$(python3 -c "print(max(${WARM_MAX}, ${LAT}))")
+    if [[ "$TOKENS" =~ ^[0-9]+$ ]]; then  # skip '?', 'None' or empty values that would break or skew $(( ))
+      WARM_TOKENS_TOTAL=$((WARM_TOKENS_TOTAL + TOKENS))
+    fi
+  else
+    err "[${i}/${WARM_REQUESTS}] HTTP ${CODE} (${LAT}s)"
+    WARM_FAILURES=$((WARM_FAILURES + 1))
+  fi
+done
+
+WARM_SUCCESS=$((WARM_REQUESTS - WARM_FAILURES))
+if (( WARM_SUCCESS > 0 )); then
+  WARM_AVG=$(python3 -c "print(f'{${WARM_TOTAL} / ${WARM_SUCCESS}:.3f}')")
+  WARM_P50=$(python3 -c "
+import statistics
+lats = [float(x) for x in '${WARM_LATENCIES}'.split()]
+print(f'{statistics.median(lats):.3f}')
+")
+  WARM_TOKENS_AVG=$((WARM_TOKENS_TOTAL / WARM_SUCCESS))  # integer division, hence the '~' in the report
+  echo ""
+  ok "Avg: ${WARM_AVG}s | P50: ${WARM_P50}s | Min: ${WARM_MIN}s | Max: ${WARM_MAX}s"
+  ok "Avg tokens/request: ~${WARM_TOKENS_AVG}"
+else
+  WARM_AVG="N/A"; WARM_P50="N/A"; WARM_MIN="N/A"; WARM_MAX="N/A"; WARM_TOKENS_AVG=0  # placeholders read by the summary and JSON writer
+  err "All warm requests failed"
+fi
+
+# ══════════════════════════════════════════════════════════════════════════════
+# Phase 3: Burst Test — concurrent document requests
+# ══════════════════════════════════════════════════════════════════════════════
+echo ""
+hr
+bold "Phase 3: Burst Test (${BURST_SIZE} concurrent document requests)"
+
+BURST_DIR=$(mktemp -d /tmp/burst_img_XXXX)  # scratch directory: one result file per concurrent request
+BURST_START=$(ts)
+
+for i in $(seq 1 "$BURST_SIZE"); do
+  (  # subshell, sent to the background below so the requests overlap
+    start=$(ts)
+    response=$(curl -s -w "\n%{http_code}" \
+      "${BASE_URL}/chat/completions" \
+      -H "$AUTH" \
+      -H "Content-Type: application/json" \
+      --max-time 600 \
+      -d @"${PAYLOAD_FILE}" 2>&1) || true
+    end=$(ts)
+    code=$(echo "$response" | tail -1)  # the -w format appends the HTTP status as the last line
+    elapsed=$(python3 -c "print(f'{${end} - ${start}:.3f}')")
+    echo "${elapsed} ${code}" > "${BURST_DIR}/${i}.txt"  # results go through files because subshell variables are not visible to the parent
+  ) &
+done
+
+wait  # block until every background request has finished
+BURST_END=$(ts)
+BURST_WALL=$(python3 -c "print(f'{${BURST_END} - ${BURST_START}:.3f}')")  # wall-clock time for the whole burst
+
+BURST_LATS=""  # space-separated latencies of the successful burst requests
+BURST_OK=0
+BURST_FAIL=0
+
+for f in "${BURST_DIR}"/*.txt; do
+  read -r LAT CODE < "$f"  # each file holds: elapsed seconds, HTTP status
+  if [[ "$CODE" == "200" ]]; then
+    BURST_LATS="${BURST_LATS} ${LAT}"
+    BURST_OK=$((BURST_OK + 1))
+    info "[ok] ${LAT}s"
+  else
+    BURST_FAIL=$((BURST_FAIL + 1))
+    err "[HTTP ${CODE}] ${LAT}s"
+  fi
+done
+
+rm -rf "$BURST_DIR"  # per-request result files are no longer needed once tallied
+
+echo ""
+ok "Wall time: ${BURST_WALL}s | ${BURST_OK}/${BURST_SIZE} succeeded"
+
+if (( BURST_OK > 0 )); then  # statistics need at least one successful latency
+  BURST_STATS=$(python3 -c "
+import statistics
+lats = [float(x) for x in '${BURST_LATS}'.split()]
+print(f'Avg: {statistics.mean(lats):.3f}s | P50: {statistics.median(lats):.3f}s | Max: {max(lats):.3f}s')
+")
+  ok "${BURST_STATS}"
+fi
+
+# ══════════════════════════════════════════════════════════════════════════════
+# Summary
+# ══════════════════════════════════════════════════════════════════════════════
+echo ""
+hr
+bold "Summary"
+echo ""
+printf "  %-30s %s\n" "First doc inference:" "${FIRST_LAT}s (${FIRST_TOKENS} tokens)"  # label left-aligned in a 30-column field
+printf "  %-30s %s\n" "Warm avg (${WARM_SUCCESS} reqs):" "${WARM_AVG}s"  # WARM_* values are "N/A" when every warm request failed
+printf "  %-30s %s\n" "Warm P50:" "${WARM_P50}s"
+printf "  %-30s %s\n" "Warm min/max:" "${WARM_MIN}s / ${WARM_MAX}s"
+printf "  %-30s %s\n" "Avg tokens/request:" "~${WARM_TOKENS_AVG}"
+printf "  %-30s %s\n" "Burst wall (${BURST_SIZE} reqs):" "${BURST_WALL}s"
+echo ""
+
+# ── Write JSON results ───────────────────────────────────────────────────────
+python3 -c "
+import json, datetime
+results = {
+    'timestamp': datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None).isoformat() + 'Z',
+    'endpoint_id': '${RUNPOD_ENDPOINT_ID}',
+    'model': '${MODEL_NAME}',
+    'image_size_kb': ${IMG_SIZE_KB},
+    'payload_size_kb': ${PAYLOAD_SIZE_KB},
+    'max_tokens': ${MAX_TOKENS},
+    'first_inference_secs': ${FIRST_LAT},
+    'first_inference_tokens': '${FIRST_TOKENS}',  # kept as a string because the value may be a non-numeric placeholder
+    'warm': {
+        'requests': ${WARM_SUCCESS},
+        'avg_secs': ${WARM_AVG} if '${WARM_AVG}' != 'N/A' else None,  # on N/A the else branch wins, so the left operand is never evaluated
+        'p50_secs': ${WARM_P50} if '${WARM_P50}' != 'N/A' else None,
+        'min_secs': ${WARM_MIN} if '${WARM_MIN}' != 'N/A' else None,
+        'max_secs': ${WARM_MAX} if '${WARM_MAX}' != 'N/A' else None,
+        'avg_tokens': ${WARM_TOKENS_AVG},
+    },
+    'burst': {
+        'concurrency': ${BURST_SIZE},
+        'wall_secs': ${BURST_WALL},
+        'succeeded': ${BURST_OK},
+        'failed': ${BURST_FAIL},
+    }
+}
+with open('${RESULTS_FILE}', 'w') as f:
+    json.dump(results, f, indent=2)
+"
+
+ok "Results written to ${RESULTS_FILE}"
+
+# ── Cleanup ──────────────────────────────────────────────────────────────────
+rm -f "$PAYLOAD_FILE"  # delete the request payload file that the curl calls posted with -d @
+echo ""
diff --git a/scripts/test_endpoint.sh b/scripts/test_endpoint.sh
index 092ca97..507f8be 100755
--- a/scripts/test_endpoint.sh
+++ b/scripts/test_endpoint.sh
@@ -12,7 +12,7 @@ fi
 : "${RUNPOD_API_KEY:?Set RUNPOD_API_KEY in .env or environment}"
 : "${RUNPOD_ENDPOINT_ID:?Set RUNPOD_ENDPOINT_ID in .env or environment}"
 
-BASE_URL="https://${RUNPOD_ENDPOINT_ID}.api.runpod.ai/openai/v1"
+BASE_URL="https://api.runpod.ai/v2/${RUNPOD_ENDPOINT_ID}/openai/v1"
 AUTH="Authorization: Bearer ${RUNPOD_API_KEY}"
 
 # ── Helpers ──────────────────────────────────────────────────────────────────