From abb3ddbd7d69be79c07d5e9a826cef8b8b07f1ac Mon Sep 17 00:00:00 2001 From: Saeid Alizadeh Date: Sun, 8 Mar 2026 10:46:21 -0600 Subject: [PATCH] scripts --- .gitignore | 3 +- Dockerfile | 1 + TASKS.md | 175 ----------------- docs/cold-start-analysis.md | 138 +++++++++++++ scripts/benchmark_coldstart.sh | 288 ++++++++++++++++++++++++++++ scripts/benchmark_inference.sh | 341 +++++++++++++++++++++++++++++++++ scripts/test_endpoint.sh | 2 +- 7 files changed, 771 insertions(+), 177 deletions(-) delete mode 100644 TASKS.md create mode 100644 docs/cold-start-analysis.md create mode 100755 scripts/benchmark_coldstart.sh create mode 100755 scripts/benchmark_inference.sh diff --git a/.gitignore b/.gitignore index 2eea525..c8db7c0 100644 --- a/.gitignore +++ b/.gitignore @@ -1 +1,2 @@ -.env \ No newline at end of file +.env +results/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index ba1bf8a..2ab89e3 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,3 +1,4 @@ +# TODO: this might be slow. llama cpp version might be faster FROM runpod/worker-v1-vllm:v2.13.1 ENV MODEL_NAME="ibm-granite/granite-docling-258M" diff --git a/TASKS.md b/TASKS.md deleted file mode 100644 index e136356..0000000 --- a/TASKS.md +++ /dev/null @@ -1,175 +0,0 @@ -# Task: RunPod Serverless VLM Endpoint - -## Goal - -Build a Docker container that serves the `granite-docling-258M` vision model as an -OpenAI-compatible HTTP API, then deploy it on RunPod Serverless so it auto-scales -to zero when idle. - -## Context - -Our backend worker (`ks-backend`) already knows how to talk to this model. It sends -HTTP requests to two endpoints: - -1. `GET /v1/models` — "what models are loaded?" (health check) -2. `POST /v1/chat/completions` — "here's a page image, extract the content" - -Currently this runs on a Vast.ai GPU 24/7 (~$650/month). We want RunPod Serverless -so we only pay when requests come in. 
- -### What the backend expects - -The backend sends these env vars to configure the connection: - -``` -VLM_ENDPOINT=http:///v1/chat/completions # full URL to chat completions -VLM_MODEL=granite-docling-258M # model name returned by /v1/models -VLM_API_KEY= # sent as Authorization: Bearer -``` - -The backend calls `GET /v1/models`, checks that the configured model name appears in -the response list (case-insensitive), and if so, sends chat completion requests with: - -```json -{ - "model": "", - "messages": [{"role": "user", "content": "Convert this page to docling."}], - "temperature": 0.0, - "skip_special_tokens": false -} -``` - -Images are sent inline in the messages as base64 data URIs (standard OpenAI vision -format). The response must be standard OpenAI chat completion format. - -### VLM pipeline settings (from ks-backend ingestion_config.yaml) - -- `vlm_concurrency: 2` — up to 2 parallel requests per document -- `vlm_temperature: 0.0` -- `vlm_timeout: 300` — 5 minute timeout per request -- `document_timeout: 7200` — 2 hour max for full document conversion - ---- - -## Tasks - -### 1. Find the right model on HuggingFace - -- The model is IBM's `granite-docling-258M` (a small vision-language model for - document understanding) -- HuggingFace ID: look for `ds4sd/docling-granite-258M-preview` or similar under - the `ibm-granite` or `ds4sd` organizations -- We need either: - - **The original safetensors** (if using vLLM to serve it), OR - - **A GGUF conversion** (if using llama.cpp to serve it) -- The model is only 258M parameters — any modern GPU can run it - -### 2. Choose a serving approach - -Pick ONE of these. Both produce the same OpenAI-compatible API. - -#### Option A: RunPod's vLLM worker (recommended — least work) - -- RunPod has a pre-built Docker image: `runpod/worker-vllm` -- It loads a model from HuggingFace and serves it with OpenAI-compatible endpoints -- You configure it via environment variables (model name, HF token, etc.) 
-- RunPod exposes it at: `https://api.runpod.ai/v2/{endpoint_id}/openai/v1/...` -- Docs: https://github.com/runpod-workers/worker-vllm -- **Verify that vLLM supports this specific model** (check vLLM's supported models - list for vision models) - -#### Option B: Custom Docker image with llama.cpp - -- Write a `Dockerfile` that: - 1. Starts from a CUDA base image (e.g. `nvidia/cuda:12.4.0-runtime-ubuntu22.04`) - 2. Installs llama.cpp (build from source or use a release binary) - 3. Downloads the GGUF model file at build time (bake it into the image) or at - container startup - 4. Runs `llama-server` which natively exposes `/v1/chat/completions` and - `/v1/models` -- The startup command would be something like: - ``` - llama-server --model /models/granite-docling-258M.gguf --port 8000 --host 0.0.0.0 - ``` -- Push the image to Docker Hub (or RunPod's container registry) - -### 3. Deploy on RunPod Serverless - -1. Go to RunPod → Serverless → New Endpoint -2. If using Option A (vLLM worker): select the vLLM template, configure the model -3. If using Option B (custom image): point it to your Docker Hub image -4. Configure: - - **GPU type**: cheapest that fits (RTX 4000/3090/4090 — model is tiny) - - **Active workers**: `0` (this is the whole point — zero cost when idle) - - **Max workers**: `1` (start with 1, increase later if needed) - - **Idle timeout**: `300` seconds (5 minutes — GPU shuts down after this) - - **Execution timeout**: `600` seconds (long enough for big PDFs) - -### 4. 
Verify the endpoint works - -Once deployed, test from the command line: - -```bash -# Check /v1/models — should list the model name -curl https://api.runpod.ai/v2/{endpoint_id}/openai/v1/models \ - -H "Authorization: Bearer $RUNPOD_API_KEY" - -# Send a test chat completion (text-only, no image, just to verify the format) -curl https://api.runpod.ai/v2/{endpoint_id}/openai/v1/chat/completions \ - -H "Authorization: Bearer $RUNPOD_API_KEY" \ - -H "Content-Type: application/json" \ - -d '{ - "model": "granite-docling-258M", - "messages": [{"role": "user", "content": "Hello"}], - "temperature": 0.0 - }' -``` - -Both should return standard OpenAI-format JSON responses. - -### 5. Measure latency - -- **Cold start**: time the first request after the endpoint has been idle (GPU - spins up from zero). Run the curl above with `time` in front. -- **Hot start**: time a second request immediately after. This is the steady-state - performance. -- Record both numbers. - -### 6. Wire it into ks-backend for testing - -Update `ks-backend/.env.dev` with: - -``` -VLM_ENDPOINT=https://api.runpod.ai/v2/{endpoint_id}/openai/v1/chat/completions -VLM_MODEL=granite-docling-258M -VLM_API_KEY= -``` - -Then: -1. Run `make dev-api` and `make dev-worker` -2. Watch worker logs — look for `vlm_model_available` (success) or - `vlm_endpoint_unreachable` / `vlm_model_not_found` (failure) -3. Upload a test PDF through the app -4. Confirm VLM-powered ingestion completes - -### Potential issues - -- **Cold start vs health check timeout**: The backend's health check (`GET /v1/models`) - has a 10-second timeout. If cold start takes longer, the check will fail and the - worker falls back to the non-VLM pipeline silently. Fix: pre-warm the endpoint with - a manual curl before testing, or increase the timeout in - `ks-backend/src/worker/utils/docling.py:check_vlm_available()`. -- **Model name mismatch**: The model name in `/v1/models` response must match - `VLM_MODEL` (case-insensitive). 
Check what name the server actually reports. -- **RunPod URL format**: Make sure the URL path is correct. RunPod's OpenAI-compatible - proxy lives under `/openai/v1/...` not just `/v1/...`. The full URL would be - `https://api.runpod.ai/v2/{endpoint_id}/openai/v1/chat/completions`. - -## Definition of Done - -- Docker image is built and pushed (or vLLM template is configured) -- RunPod serverless endpoint is running with active workers = 0 -- `GET /v1/models` returns the model name -- `POST /v1/chat/completions` returns a valid response -- Cold start and hot start times are measured and recorded -- OR: document why it doesn't work and what blocked it diff --git a/docs/cold-start-analysis.md b/docs/cold-start-analysis.md new file mode 100644 index 0000000..59a8846 --- /dev/null +++ b/docs/cold-start-analysis.md @@ -0,0 +1,138 @@ +# Cold Start Analysis: RunPod Serverless GPU + +## Context + +We serve IBM's [granite-docling-258M](https://huggingface.co/ibm-granite/granite-docling-258M) vision-language model on a RunPod serverless endpoint for document ingestion. Our workloads are bursty — documents arrive in spikes, with idle periods in between. This analysis determines whether we need a warm pool of GPU instances or can rely on scale-to-zero. + +## Benchmark Results + +All benchmarks ran against endpoint `docling-vlm-2` using `scripts/benchmark_coldstart.sh` and simple text prompts (`max_tokens: 16`). Image-based inference benchmarks (`scripts/benchmark_inference.sh`) should be run separately for realistic per-document latency. + +### True Cold Start (first-ever boot, no FlashBoot cache) + +| Metric | Value | +|---|---| +| Cold start | **~80s** | +| First inference | 0.75s | + +This was observed on the very first request after deploying the endpoint, before RunPod had any cached state. 
+ +### FlashBoot Cold Start (0 running workers, $0.00/s billing, but recently used) + +| Metric | Value | +|---|---| +| Cold start | **~1.4s** | +| First inference | 0.67s | + +Even with 0 running workers and no active billing, RunPod's FlashBoot revived a cached worker in ~1.4s. This was reproducible across multiple runs. + +### Warm Inference (worker already running) + +| Metric | Value | +|---|---| +| Avg latency (text, 16 tokens) | 0.71s | +| P50 latency | 0.69s | +| Min / Max | 0.65s / 0.80s | + +### Burst Test (5 concurrent text requests) + +| Metric | Value | +|---|---| +| Wall time | ~3.9s | +| Avg per-request | 2.2s | +| P50 | 2.6s | +| Success rate | 5/5 | + +Higher per-request latency during bursts is expected — the endpoint has `MAX_CONCURRENCY=2`, so requests queue behind each other on a single worker. + +> **Note:** These numbers are for trivial text prompts. Real document image inference will be significantly slower due to image encoding, vision preprocessing, and longer output generation (500-2000+ tokens). Run `scripts/benchmark_inference.sh` for realistic numbers. + +## RunPod Worker Types + +| Type | Behavior | Billing | Cold Start | +|---|---|---|---| +| **Active Workers** | Always on, never shut down | Continuous (40% discount) | None | +| **Flex Workers** | Spin up on demand, shut down after idle timeout | Only while running | FlashBoot or full cold start | + +- **Active Workers** = minimum workers always running. Set via endpoint config. +- **Max Workers** = ceiling for autoscaling. Flex workers spin up to fill the gap between active and max. +- **Idle Timeout** = how long a flex worker stays alive after finishing its last job (default: 5s). Worker is fully shut down after this expires. + +Source: [RunPod Endpoint Configurations](https://docs.runpod.io/serverless/endpoints/endpoint-configurations) + +## FlashBoot + +FlashBoot is RunPod's container caching system that reduces cold starts by retaining worker state after shutdown. 
It's free and enabled by default. + +### Key characteristics + +- **Probabilistic, not time-based.** There is no fixed TTL or cache duration. +- **Decay curve:** Requesting a worker immediately after shutdown gives the highest chance of a FlashBoot hit. The probability decreases over time until eventually you get a full cold start. +- **No guaranteed SLA.** RunPod staff confirmed: *"there isn't a fixed timeframe — it is based on the requests you have and their platform available resources."* +- **Traffic-dependent.** Endpoints with consistent traffic get better FlashBoot hit rates. After extended idle periods, FlashBoot *"is disabled as the instance goes to a deeper sleep."* +- **Image popularity matters.** Container images used by more RunPod customers are cached more aggressively across the platform. + +### What we observed + +| Scenario | Cold start time | +|---|---| +| First-ever request (no cache) | ~80s | +| Request after ~20 min idle | ~1.4s (FlashBoot hit) | +| After hours/days idle (not measured) | Likely ~80s (FlashBoot cache expired) | + +### Sources + +- [Introducing FlashBoot: 1-Second Serverless Cold-Start (RunPod Blog)](https://www.runpod.io/blog/introducing-flashboot-serverless-cold-start) +- [Keeping Flashboot active? (RunPod Discord)](https://www.answeroverflow.com/m/1293671895564161116) +- [Flashboot not working after a while (RunPod Discord)](https://www.answeroverflow.com/m/1340825479820611624) +- [Serverless or Regular Pod? How good is Flashboot? (RunPod Discord)](https://www.answeroverflow.com/m/1292890615922561076) +- [Very slow cold starts with FlashBoot (GitHub Issue)](https://github.com/runpod-workers/worker-vllm/issues/111) + +## Recommendations + +### For bursty workloads with predictable patterns (e.g. business-hours ingestion) + +**Set Active Workers = 0, Idle Timeout = 300s.** Workers stay warm between closely-spaced bursts and shut down during long gaps. FlashBoot handles the re-warm if the gap is short enough. 
+ +Optionally, send a pre-warm request (e.g. `GET /v1/models`) before kicking off a batch job to absorb the cold start outside the critical path. + +### For unpredictable bursts with long idle gaps (hours/days) + +**Set Active Workers = 1.** One worker is always warm and handles the first request instantly. Flex workers scale up for the rest of the burst. This costs more (continuous billing at 40% discount) but guarantees no cold start penalty. + +### For cost-sensitive, latency-tolerant workloads + +**Set Active Workers = 0, rely on FlashBoot.** Accept that the first request after a long gap may take ~80s. Subsequent requests in the same burst will be fast. This is the cheapest option. + +### Cost comparison (rough estimate) + +Assuming an RTX A4500 at ~$0.29/hr on RunPod serverless: + +| Strategy | Monthly idle cost | Cold start risk | +|---|---|---| +| Active Workers = 0 | $0 | 1.4s–80s (unpredictable) | +| Active Workers = 1 | ~$210/mo | None | +| Idle Timeout = 300s | Depends on traffic | None within 5 min of last request | + +Compare to the previous always-on Vast.ai GPU at **~$650/mo**. + +## Scripts + +- `scripts/benchmark_coldstart.sh` — Measures cold start, warm inference, and burst latency with simple text prompts. +- `scripts/benchmark_inference.sh` — Measures realistic inference latency using actual document page images. + +### Usage + +```bash +# True cold start: scale to 0 in RunPod dashboard, wait for workers to fully terminate +./scripts/benchmark_coldstart.sh + +# Realistic document inference (run after endpoint is warm) +./scripts/benchmark_inference.sh + +# Custom parameters +WARM_REQUESTS=10 BURST_SIZE=10 ./scripts/benchmark_coldstart.sh +SAMPLE_IMAGE=/path/to/your/doc.png MAX_TOKENS=4096 ./scripts/benchmark_inference.sh +``` + +Results are saved as timestamped JSON files in `results/` (gitignored). 
diff --git a/scripts/benchmark_coldstart.sh b/scripts/benchmark_coldstart.sh new file mode 100755 index 0000000..bc7a676 --- /dev/null +++ b/scripts/benchmark_coldstart.sh @@ -0,0 +1,288 @@ +#!/usr/bin/env bash +set -euo pipefail + +# ── Cold Start & Latency Benchmark for RunPod Serverless GPU ───────────────── +# +# Measures: +# 1. Cold start time — how long until the endpoint is ready (from idle) +# 2. First inference — first request latency (model may still be warming) +# 3. Warm inference — average latency over N requests on a hot worker +# 4. Burst throughput — N concurrent requests to simulate bursty ingestion +# +# Prerequisites: +# - RUNPOD_API_KEY and RUNPOD_ENDPOINT_ID set in .env or environment +# - Endpoint should be IDLE (0 active workers) for accurate cold start measurement +# You can scale to 0 in the RunPod dashboard before running this. + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ENV_FILE="${SCRIPT_DIR}/../.env" + +if [[ -f "$ENV_FILE" ]]; then + set -a; source "$ENV_FILE"; set +a +fi + +: "${RUNPOD_API_KEY:?Set RUNPOD_API_KEY in .env or environment}" +: "${RUNPOD_ENDPOINT_ID:?Set RUNPOD_ENDPOINT_ID in .env or environment}" + +BASE_URL="https://api.runpod.ai/v2/${RUNPOD_ENDPOINT_ID}/openai/v1" +AUTH="Authorization: Bearer ${RUNPOD_API_KEY}" +WARM_REQUESTS=${WARM_REQUESTS:-5} +BURST_SIZE=${BURST_SIZE:-5} +POLL_INTERVAL=5 +MAX_WAIT=300 + +# ── Helpers ────────────────────────────────────────────────────────────────── +ts() { python3 -c "import time; print(time.time())"; } +bold() { printf "\033[1m%s\033[0m\n" "$1"; } +info() { printf " %s\n" "$1"; } +ok() { printf "\033[32m ✓ %s\033[0m\n" "$1"; } +err() { printf "\033[31m ✗ %s\033[0m\n" "$1"; } +hr() { echo "────────────────────────────────────────────────────────"; } + +RESULTS_DIR="${SCRIPT_DIR}/../results" +mkdir -p "$RESULTS_DIR" +RESULTS_FILE="${RESULTS_DIR}/coldstart_$(date +%Y%m%d_%H%M%S).json" + +cat <&1) || true + end=$(ts) + code=$(echo "$response" | tail -1) + 
elapsed=$(python3 -c "print(f'{${end} - ${start}:.3f}')") + echo "${elapsed} ${code}" +} + +# ══════════════════════════════════════════════════════════════════════════════ +# Phase 1: Cold Start — poll /v1/models until ready +# ══════════════════════════════════════════════════════════════════════════════ +hr +bold "Phase 1: Cold Start (polling /v1/models)" +info "Tip: scale endpoint to 0 workers first for an accurate measurement." +echo "" + +COLD_START_BEGIN=$(ts) +WAITED=0 +READY=false + +while (( WAITED < MAX_WAIT )); do + CODE=$(curl -s -o /dev/null -w "%{http_code}" \ + "${BASE_URL}/models" \ + -H "$AUTH" \ + --max-time 30 2>/dev/null) || CODE="000" + + if [[ "$CODE" == "200" ]]; then + READY=true + break + fi + + WAITED=$((WAITED + POLL_INTERVAL)) + printf "\r ⏳ %3ds — HTTP %s" "$WAITED" "$CODE" + sleep "$POLL_INTERVAL" +done +echo "" + +COLD_START_END=$(ts) +COLD_START_SECS=$(python3 -c "print(f'{${COLD_START_END} - ${COLD_START_BEGIN}:.1f}')") + +if $READY; then + ok "Endpoint ready in ${COLD_START_SECS}s" +else + err "Endpoint not ready after ${MAX_WAIT}s — aborting" + exit 1 +fi + +# Discover model name +MODEL_NAME=$(curl -s "${BASE_URL}/models" -H "$AUTH" \ + | python3 -c "import sys,json; print(json.load(sys.stdin)['data'][0]['id'])" 2>/dev/null \ + || echo "granite-docling-258M") +info "Model: ${MODEL_NAME}" + +# ══════════════════════════════════════════════════════════════════════════════ +# Phase 2: First Inference (may include model warm-up overhead) +# ══════════════════════════════════════════════════════════════════════════════ +echo "" +hr +bold "Phase 2: First Inference" + +read -r FIRST_LATENCY FIRST_CODE <<< "$(do_request)" + +if [[ "$FIRST_CODE" == "200" ]]; then + ok "First inference: ${FIRST_LATENCY}s" +else + err "First inference failed: HTTP ${FIRST_CODE} (${FIRST_LATENCY}s)" +fi + +# ══════════════════════════════════════════════════════════════════════════════ +# Phase 3: Warm Inference — sequential requests on a hot worker +# 
══════════════════════════════════════════════════════════════════════════════ +echo "" +hr +bold "Phase 3: Warm Inference (${WARM_REQUESTS} sequential requests)" + +WARM_TOTAL=0 +WARM_MIN=999999 +WARM_MAX=0 +WARM_FAILURES=0 +WARM_LATENCIES="" + +for i in $(seq 1 "$WARM_REQUESTS"); do + read -r LAT CODE <<< "$(do_request)" + if [[ "$CODE" == "200" ]]; then + printf " [%d/%d] %ss\n" "$i" "$WARM_REQUESTS" "$LAT" + WARM_LATENCIES="${WARM_LATENCIES} ${LAT}" + WARM_TOTAL=$(python3 -c "print(${WARM_TOTAL} + ${LAT})") + WARM_MIN=$(python3 -c "print(min(${WARM_MIN}, ${LAT}))") + WARM_MAX=$(python3 -c "print(max(${WARM_MAX}, ${LAT}))") + else + err "[${i}/${WARM_REQUESTS}] HTTP ${CODE} (${LAT}s)" + WARM_FAILURES=$((WARM_FAILURES + 1)) + fi +done + +WARM_SUCCESS=$((WARM_REQUESTS - WARM_FAILURES)) +if (( WARM_SUCCESS > 0 )); then + WARM_AVG=$(python3 -c "print(f'{${WARM_TOTAL} / ${WARM_SUCCESS}:.3f}')") + WARM_P50=$(python3 -c " +import statistics +lats = [float(x) for x in '${WARM_LATENCIES}'.split()] +print(f'{statistics.median(lats):.3f}') +") + echo "" + ok "Avg: ${WARM_AVG}s | P50: ${WARM_P50}s | Min: ${WARM_MIN}s | Max: ${WARM_MAX}s" + if (( WARM_FAILURES > 0 )); then + err "${WARM_FAILURES}/${WARM_REQUESTS} requests failed" + fi +else + WARM_AVG="N/A"; WARM_P50="N/A"; WARM_MIN="N/A"; WARM_MAX="N/A" + err "All warm requests failed" +fi + +# ══════════════════════════════════════════════════════════════════════════════ +# Phase 4: Burst Test — concurrent requests to simulate ingestion spike +# ══════════════════════════════════════════════════════════════════════════════ +echo "" +hr +bold "Phase 4: Burst Test (${BURST_SIZE} concurrent requests)" + +BURST_DIR=$(mktemp -d /tmp/burst_XXXX) +BURST_START=$(ts) + +for i in $(seq 1 "$BURST_SIZE"); do + ( + start=$(ts) + code=$(curl -s -o /dev/null -w "%{http_code}" \ + "${BASE_URL}/chat/completions" \ + -H "$AUTH" \ + -H "Content-Type: application/json" \ + --max-time 300 \ + -d '{ + "model": "'"${MODEL_NAME}"'", + "messages": 
[{"role": "user", "content": "Hello"}], + "temperature": 0.0, + "max_tokens": 16 + }' 2>/dev/null) || code="000" + end=$(ts) + elapsed=$(python3 -c "print(f'{${end} - ${start}:.3f}')") + echo "${elapsed} ${code}" > "${BURST_DIR}/${i}.txt" + ) & +done + +wait +BURST_END=$(ts) +BURST_WALL=$(python3 -c "print(f'{${BURST_END} - ${BURST_START}:.3f}')") + +BURST_LATS="" +BURST_OK=0 +BURST_FAIL=0 + +for f in "${BURST_DIR}"/*.txt; do + read -r LAT CODE < "$f" + if [[ "$CODE" == "200" ]]; then + BURST_LATS="${BURST_LATS} ${LAT}" + BURST_OK=$((BURST_OK + 1)) + info "[ok] ${LAT}s" + else + BURST_FAIL=$((BURST_FAIL + 1)) + err "[HTTP ${CODE}] ${LAT}s" + fi +done + +rm -rf "$BURST_DIR" + +echo "" +ok "Wall time: ${BURST_WALL}s | ${BURST_OK}/${BURST_SIZE} succeeded" + +if (( BURST_OK > 0 )); then + BURST_STATS=$(python3 -c " +import statistics +lats = [float(x) for x in '${BURST_LATS}'.split()] +print(f'Avg: {statistics.mean(lats):.3f}s | P50: {statistics.median(lats):.3f}s | Max: {max(lats):.3f}s') +") + ok "${BURST_STATS}" +fi + +# ══════════════════════════════════════════════════════════════════════════════ +# Summary +# ══════════════════════════════════════════════════════════════════════════════ +echo "" +hr +bold "Summary" +echo "" +printf " %-25s %s\n" "Cold start:" "${COLD_START_SECS}s" +printf " %-25s %s\n" "First inference:" "${FIRST_LATENCY}s" +printf " %-25s %s\n" "Warm avg (${WARM_SUCCESS} reqs):" "${WARM_AVG}s" +printf " %-25s %s\n" "Warm P50:" "${WARM_P50}s" +printf " %-25s %s\n" "Burst wall (${BURST_SIZE} reqs):" "${BURST_WALL}s" +echo "" + +# ── Write JSON results ─────────────────────────────────────────────────────── +python3 -c " +import json, datetime +results = { + 'timestamp': datetime.datetime.utcnow().isoformat() + 'Z', + 'endpoint_id': '${RUNPOD_ENDPOINT_ID}', + 'model': '${MODEL_NAME}', + 'cold_start_secs': ${COLD_START_SECS}, + 'first_inference_secs': ${FIRST_LATENCY}, + 'warm': { + 'requests': ${WARM_SUCCESS}, + 'avg_secs': ${WARM_AVG} if 
'${WARM_AVG}' != 'N/A' else None, + 'p50_secs': ${WARM_P50} if '${WARM_P50}' != 'N/A' else None, + 'min_secs': ${WARM_MIN} if '${WARM_MIN}' != 'N/A' else None, + 'max_secs': ${WARM_MAX} if '${WARM_MAX}' != 'N/A' else None, + }, + 'burst': { + 'concurrency': ${BURST_SIZE}, + 'wall_secs': ${BURST_WALL}, + 'succeeded': ${BURST_OK}, + 'failed': ${BURST_FAIL}, + } +} +with open('${RESULTS_FILE}', 'w') as f: + json.dump(results, f, indent=2) +" + +ok "Results written to ${RESULTS_FILE}" +echo "" diff --git a/scripts/benchmark_inference.sh b/scripts/benchmark_inference.sh new file mode 100755 index 0000000..e525903 --- /dev/null +++ b/scripts/benchmark_inference.sh @@ -0,0 +1,341 @@ +#!/usr/bin/env bash +set -euo pipefail + +# ── Inference Latency Benchmark (with real document image) ─────────────────── +# +# Measures realistic inference times by sending an actual document page image +# to the endpoint, simulating real ingestion workloads. +# +# Measures: +# 1. Image download + base64 encoding overhead +# 2. Single document inference (full output) +# 3. Warm sequential inference (N requests) +# 4. 
Burst concurrent inference (N parallel requests) +# +# Prerequisites: +# - RUNPOD_API_KEY and RUNPOD_ENDPOINT_ID in .env or environment +# - Endpoint should already be WARM (run benchmark_coldstart.sh first) +# +# Usage: +# ./scripts/benchmark_inference.sh # defaults +# WARM_REQUESTS=10 BURST_SIZE=10 ./scripts/benchmark_inference.sh +# SAMPLE_IMAGE=/path/to/your/doc.png ./scripts/benchmark_inference.sh + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ENV_FILE="${SCRIPT_DIR}/../.env" + +if [[ -f "$ENV_FILE" ]]; then + set -a; source "$ENV_FILE"; set +a +fi + +: "${RUNPOD_API_KEY:?Set RUNPOD_API_KEY in .env or environment}" +: "${RUNPOD_ENDPOINT_ID:?Set RUNPOD_ENDPOINT_ID in .env or environment}" + +BASE_URL="https://api.runpod.ai/v2/${RUNPOD_ENDPOINT_ID}/openai/v1" +AUTH="Authorization: Bearer ${RUNPOD_API_KEY}" +WARM_REQUESTS=${WARM_REQUESTS:-3} +BURST_SIZE=${BURST_SIZE:-3} +MAX_TOKENS=${MAX_TOKENS:-4096} + +# ── Helpers ────────────────────────────────────────────────────────────────── +ts() { python3 -c "import time; print(time.time())"; } +bold() { printf "\033[1m%s\033[0m\n" "$1"; } +info() { printf " %s\n" "$1"; } +ok() { printf "\033[32m ✓ %s\033[0m\n" "$1"; } +err() { printf "\033[31m ✗ %s\033[0m\n" "$1"; } +hr() { echo "────────────────────────────────────────────────────────"; } + +RESULTS_DIR="${SCRIPT_DIR}/../results" +mkdir -p "$RESULTS_DIR" +RESULTS_FILE="${RESULTS_DIR}/inference_$(date +%Y%m%d_%H%M%S).json" + +# ── Prepare sample document image ──────────────────────────────────────────── +hr +bold "Preparing document image" + +SAMPLE_IMG_URL="https://huggingface.co/ibm-granite/granite-docling-258M/resolve/main/examples/input/wikipedia_example.png" + +if [[ -n "${SAMPLE_IMAGE:-}" && -f "${SAMPLE_IMAGE}" ]]; then + info "Using provided image: ${SAMPLE_IMAGE}" + SAMPLE_B64=$(base64 < "$SAMPLE_IMAGE") + IMG_SIZE_KB=$(( $(wc -c < "$SAMPLE_IMAGE") / 1024 )) +else + info "Downloading sample from HuggingFace..." 
+ TMP_IMG=$(mktemp /tmp/sample_page_XXXX.png) + if curl -sL -o "$TMP_IMG" "$SAMPLE_IMG_URL" && [[ -s "$TMP_IMG" ]]; then + SAMPLE_B64=$(base64 < "$TMP_IMG") + IMG_SIZE_KB=$(( $(wc -c < "$TMP_IMG") / 1024 )) + rm -f "$TMP_IMG" + else + err "Could not download sample image. Provide one via SAMPLE_IMAGE=/path/to/doc.png" + rm -f "$TMP_IMG" + exit 1 + fi +fi + +B64_SIZE_KB=$(( ${#SAMPLE_B64} / 1024 )) +ok "Image ready (${IMG_SIZE_KB} KB raw, ${B64_SIZE_KB} KB base64)" + +# ── Check endpoint is warm ────────────────────────────────────────────────── +echo "" +hr +bold "Checking endpoint is warm" + +CODE=$(curl -s -o /dev/null -w "%{http_code}" \ + "${BASE_URL}/models" -H "$AUTH" --max-time 15 2>/dev/null) || CODE="000" + +if [[ "$CODE" == "200" ]]; then + ok "Endpoint is ready" +else + err "Endpoint returned HTTP ${CODE} — run benchmark_coldstart.sh first to warm it up" + exit 1 +fi + +MODEL_NAME=$(curl -s "${BASE_URL}/models" -H "$AUTH" \ + | python3 -c "import sys,json; print(json.load(sys.stdin)['data'][0]['id'])" 2>/dev/null \ + || echo "granite-docling-258M") +info "Model: ${MODEL_NAME}" + +# ── Build the request payload ──────────────────────────────────────────────── +# This matches how granite-docling is used: send a page image, get docling markup +PAYLOAD_FILE=$(mktemp /tmp/bench_payload_XXXX.json) +python3 -c " +import json +payload = { + 'model': '${MODEL_NAME}', + 'messages': [{ + 'role': 'user', + 'content': [ + {'type': 'image_url', 'image_url': {'url': 'data:image/png;base64,${SAMPLE_B64}'}}, + {'type': 'text', 'text': 'Convert this page to docling markup.'} + ] + }], + 'temperature': 0.0, + 'max_tokens': ${MAX_TOKENS} +} +# Write without the base64 in logs +with open('${PAYLOAD_FILE}', 'w') as f: + json.dump(payload, f) +" +PAYLOAD_SIZE_KB=$(( $(wc -c < "$PAYLOAD_FILE") / 1024 )) +info "Payload size: ${PAYLOAD_SIZE_KB} KB" + +cat <&1) || true + end=$(ts) + code=$(echo "$response" | tail -1) + body=$(echo "$response" | sed '$d') + elapsed=$(python3 -c 
"print(f'{${end} - ${start}:.3f}')") + tokens=$(echo "$body" | python3 -c " +import sys, json +try: + r = json.load(sys.stdin) + print(r.get('usage', {}).get('completion_tokens', r.get('usage', {}).get('total_tokens', '?'))) +except: print('?') +" 2>/dev/null) + echo "${elapsed} ${code} ${tokens}" +} + +# ══════════════════════════════════════════════════════════════════════════════ +# Phase 1: First Document Inference +# ══════════════════════════════════════════════════════════════════════════════ +hr +bold "Phase 1: First Document Inference" + +read -r FIRST_LAT FIRST_CODE FIRST_TOKENS <<< "$(do_image_request)" + +if [[ "$FIRST_CODE" == "200" ]]; then + ok "First inference: ${FIRST_LAT}s (${FIRST_TOKENS} tokens)" +else + err "First inference failed: HTTP ${FIRST_CODE} (${FIRST_LAT}s)" + # Show error body for debugging + curl -s "${BASE_URL}/chat/completions" \ + -H "$AUTH" -H "Content-Type: application/json" \ + --max-time 60 -d @"${PAYLOAD_FILE}" 2>&1 | python3 -c " +import sys, json +try: print(json.dumps(json.load(sys.stdin), indent=2)[:500]) +except: print(sys.stdin.read()[:500]) +" || true +fi + +# ══════════════════════════════════════════════════════════════════════════════ +# Phase 2: Warm Sequential Inference +# ══════════════════════════════════════════════════════════════════════════════ +echo "" +hr +bold "Phase 2: Warm Inference (${WARM_REQUESTS} sequential document requests)" + +WARM_TOTAL=0 +WARM_MIN=999999 +WARM_MAX=0 +WARM_FAILURES=0 +WARM_LATENCIES="" +WARM_TOKENS_TOTAL=0 + +for i in $(seq 1 "$WARM_REQUESTS"); do + read -r LAT CODE TOKENS <<< "$(do_image_request)" + if [[ "$CODE" == "200" ]]; then + printf " [%d/%d] %ss (%s tokens)\n" "$i" "$WARM_REQUESTS" "$LAT" "$TOKENS" + WARM_LATENCIES="${WARM_LATENCIES} ${LAT}" + WARM_TOTAL=$(python3 -c "print(${WARM_TOTAL} + ${LAT})") + WARM_MIN=$(python3 -c "print(min(${WARM_MIN}, ${LAT}))") + WARM_MAX=$(python3 -c "print(max(${WARM_MAX}, ${LAT}))") + if [[ "$TOKENS" != "?" 
]]; then + WARM_TOKENS_TOTAL=$((WARM_TOKENS_TOTAL + TOKENS)) + fi + else + err "[${i}/${WARM_REQUESTS}] HTTP ${CODE} (${LAT}s)" + WARM_FAILURES=$((WARM_FAILURES + 1)) + fi +done + +WARM_SUCCESS=$((WARM_REQUESTS - WARM_FAILURES)) +if (( WARM_SUCCESS > 0 )); then + WARM_AVG=$(python3 -c "print(f'{${WARM_TOTAL} / ${WARM_SUCCESS}:.3f}')") + WARM_P50=$(python3 -c " +import statistics +lats = [float(x) for x in '${WARM_LATENCIES}'.split()] +print(f'{statistics.median(lats):.3f}') +") + WARM_TOKENS_AVG=$((WARM_TOKENS_TOTAL / WARM_SUCCESS)) + echo "" + ok "Avg: ${WARM_AVG}s | P50: ${WARM_P50}s | Min: ${WARM_MIN}s | Max: ${WARM_MAX}s" + ok "Avg tokens/request: ~${WARM_TOKENS_AVG}" +else + WARM_AVG="N/A"; WARM_P50="N/A"; WARM_MIN="N/A"; WARM_MAX="N/A"; WARM_TOKENS_AVG=0 + err "All warm requests failed" +fi + +# ══════════════════════════════════════════════════════════════════════════════ +# Phase 3: Burst Test — concurrent document requests +# ══════════════════════════════════════════════════════════════════════════════ +echo "" +hr +bold "Phase 3: Burst Test (${BURST_SIZE} concurrent document requests)" + +BURST_DIR=$(mktemp -d /tmp/burst_img_XXXX) +BURST_START=$(ts) + +for i in $(seq 1 "$BURST_SIZE"); do + ( + start=$(ts) + response=$(curl -s -w "\n%{http_code}" \ + "${BASE_URL}/chat/completions" \ + -H "$AUTH" \ + -H "Content-Type: application/json" \ + --max-time 600 \ + -d @"${PAYLOAD_FILE}" 2>&1) || true + end=$(ts) + code=$(echo "$response" | tail -1) + elapsed=$(python3 -c "print(f'{${end} - ${start}:.3f}')") + echo "${elapsed} ${code}" > "${BURST_DIR}/${i}.txt" + ) & +done + +wait +BURST_END=$(ts) +BURST_WALL=$(python3 -c "print(f'{${BURST_END} - ${BURST_START}:.3f}')") + +BURST_LATS="" +BURST_OK=0 +BURST_FAIL=0 + +for f in "${BURST_DIR}"/*.txt; do + read -r LAT CODE < "$f" + if [[ "$CODE" == "200" ]]; then + BURST_LATS="${BURST_LATS} ${LAT}" + BURST_OK=$((BURST_OK + 1)) + info "[ok] ${LAT}s" + else + BURST_FAIL=$((BURST_FAIL + 1)) + err "[HTTP ${CODE}] ${LAT}s" + 
fi +done + +rm -rf "$BURST_DIR" + +echo "" +ok "Wall time: ${BURST_WALL}s | ${BURST_OK}/${BURST_SIZE} succeeded" + +if (( BURST_OK > 0 )); then + BURST_STATS=$(python3 -c " +import statistics +lats = [float(x) for x in '${BURST_LATS}'.split()] +print(f'Avg: {statistics.mean(lats):.3f}s | P50: {statistics.median(lats):.3f}s | Max: {max(lats):.3f}s') +") + ok "${BURST_STATS}" +fi + +# ══════════════════════════════════════════════════════════════════════════════ +# Summary +# ══════════════════════════════════════════════════════════════════════════════ +echo "" +hr +bold "Summary" +echo "" +printf " %-30s %s\n" "First doc inference:" "${FIRST_LAT}s (${FIRST_TOKENS} tokens)" +printf " %-30s %s\n" "Warm avg (${WARM_SUCCESS} reqs):" "${WARM_AVG}s" +printf " %-30s %s\n" "Warm P50:" "${WARM_P50}s" +printf " %-30s %s\n" "Warm min/max:" "${WARM_MIN}s / ${WARM_MAX}s" +printf " %-30s %s\n" "Avg tokens/request:" "~${WARM_TOKENS_AVG}" +printf " %-30s %s\n" "Burst wall (${BURST_SIZE} reqs):" "${BURST_WALL}s" +echo "" + +# ── Write JSON results ─────────────────────────────────────────────────────── +python3 -c " +import json, datetime +results = { + 'timestamp': datetime.datetime.utcnow().isoformat() + 'Z', + 'endpoint_id': '${RUNPOD_ENDPOINT_ID}', + 'model': '${MODEL_NAME}', + 'image_size_kb': ${IMG_SIZE_KB}, + 'payload_size_kb': ${PAYLOAD_SIZE_KB}, + 'max_tokens': ${MAX_TOKENS}, + 'first_inference_secs': ${FIRST_LAT}, + 'first_inference_tokens': '${FIRST_TOKENS}', + 'warm': { + 'requests': ${WARM_SUCCESS}, + 'avg_secs': ${WARM_AVG} if '${WARM_AVG}' != 'N/A' else None, + 'p50_secs': ${WARM_P50} if '${WARM_P50}' != 'N/A' else None, + 'min_secs': ${WARM_MIN} if '${WARM_MIN}' != 'N/A' else None, + 'max_secs': ${WARM_MAX} if '${WARM_MAX}' != 'N/A' else None, + 'avg_tokens': ${WARM_TOKENS_AVG}, + }, + 'burst': { + 'concurrency': ${BURST_SIZE}, + 'wall_secs': ${BURST_WALL}, + 'succeeded': ${BURST_OK}, + 'failed': ${BURST_FAIL}, + } +} +with open('${RESULTS_FILE}', 'w') as f: + 
json.dump(results, f, indent=2) +" + +ok "Results written to ${RESULTS_FILE}" + +# ── Cleanup ────────────────────────────────────────────────────────────────── +rm -f "$PAYLOAD_FILE" +echo "" diff --git a/scripts/test_endpoint.sh b/scripts/test_endpoint.sh index 092ca97..507f8be 100755 --- a/scripts/test_endpoint.sh +++ b/scripts/test_endpoint.sh @@ -12,7 +12,7 @@ fi : "${RUNPOD_API_KEY:?Set RUNPOD_API_KEY in .env or environment}" : "${RUNPOD_ENDPOINT_ID:?Set RUNPOD_ENDPOINT_ID in .env or environment}" -BASE_URL="https://${RUNPOD_ENDPOINT_ID}.api.runpod.ai/openai/v1" +BASE_URL="https://api.runpod.ai/v2/${RUNPOD_ENDPOINT_ID}/openai/v1" AUTH="Authorization: Bearer ${RUNPOD_API_KEY}" # ── Helpers ──────────────────────────────────────────────────────────────────