Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
314 changes: 314 additions & 0 deletions Taskfile.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,314 @@
# https://taskfile.dev
version: '3'

vars:
  # Every variable can be overridden from the CLI or environment,
  # e.g. `task serve MODEL=... PORT=9000`.
  MODEL: '{{.MODEL | default "google/gemma-3-27b-it"}}'
  PORT: '{{.PORT | default "8000"}}'
  # Directory handed to the vLLM torch profiler (torch_profiler_dir).
  TRACE_DIR: '{{.TRACE_DIR | default "/tmp/vllm-traces"}}'
  # Destination used by traces:pull / traces:summary.
  LOCAL_TRACE_DIR: '{{.LOCAL_TRACE_DIR | default "./traces"}}'
  # Interpreter used for the helper scripts and json.tool pretty-printing.
  PYTHON: '{{.PYTHON | default ".venv/bin/python"}}'
  # File that `serve` / `serve:plain` tee their output into.
  LOG_FILE: '{{.LOG_FILE | default "/tmp/vllm-server.log"}}'
  # false = only custom spans visible; true = full Python stacks (needed for flamegraph)
  WITH_STACK: '{{.WITH_STACK | default "true"}}'
  # Stable public image used for ad-hoc profiling requests
  IMAGE_URL: '{{.IMAGE_URL | default "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"}}'
  # High-res test image from picsum.photos (3000x2000, ~6 MP)
  IMAGE_URL_HIRES: '{{.IMAGE_URL_HIRES | default "https://picsum.photos/seed/vllm/3000/2000.jpg"}}'

tasks:
default:
  # Running bare `task` prints the task catalogue; `silent` hides the echoed command.
  desc: List all tasks
  silent: true
  cmds:
    - task --list

# ── Server ────────────────────────────────────────────────────────────

serve:
  # Profiled server: ensures TRACE_DIR exists, then launches vLLM with the torch
  # profiler writing into it. stdout+stderr are mirrored to LOG_FILE through tee.
  desc: "Start vLLM with torch profiler + custom record_function scopes. Logs tee → {{.LOG_FILE}}"
  cmds:
    - mkdir -p {{.TRACE_DIR}}
    # Folded scalar: the lines below are joined with single spaces into one command.
    - >-
      VLLM_CUSTOM_SCOPES_FOR_PROFILING=1
      .venv/bin/vllm serve {{.MODEL}}
      --port {{.PORT}}
      --profiler-config '{"profiler":"torch","torch_profiler_dir":"{{.TRACE_DIR}}","torch_profiler_with_stack":{{.WITH_STACK}},"max_iterations":1}'
      2>&1 | tee {{.LOG_FILE}}

serve:plain:
  # Same launch as `serve` but without the profiler flags or the custom-scope env var.
  desc: Start vLLM without profiler (baseline / smoke test)
  cmds:
    # Folded scalar: joined into a single command line.
    - >-
      .venv/bin/vllm serve {{.MODEL}}
      --port {{.PORT}}
      2>&1 | tee {{.LOG_FILE}}

# ── Profiler control ──────────────────────────────────────────────────

profile:start:
  desc: POST /start_profile — open a trace window
  cmds:
    # --fail makes HTTP errors a non-zero exit, so the confirmation is only
    # printed (and the task only succeeds) when the server accepted the call.
    - >-
      curl --fail --silent --show-error --request POST
      http://localhost:{{.PORT}}/start_profile
      && echo " profiler started"

profile:stop:
  desc: POST /stop_profile — flush trace files to {{.TRACE_DIR}}
  cmds:
    # --fail makes HTTP errors a non-zero exit, so the confirmation is only
    # printed (and the task only succeeds) when the server accepted the call.
    - >-
      curl --fail --silent --show-error --request POST
      http://localhost:{{.PORT}}/stop_profile
      && echo " profiler stopped"

profile:request:
  # Sends a single multimodal chat completion (image URL + text prompt) to the
  # local server and pretty-prints the JSON response with json.tool.
  desc: "Send one VLM chat request with an image URL (IMAGE_URL= to override)"
  cmds:
    - |
      curl --silent http://localhost:{{.PORT}}/v1/chat/completions \
        --header "Content-Type: application/json" \
        --data '{
          "model": "{{.MODEL}}",
          "messages": [{
            "role": "user",
            "content": [
              {"type": "image_url", "image_url": {"url": "{{.IMAGE_URL}}"}},
              {"type": "text", "text": "Describe this image in one sentence."}
            ]
          }],
          "max_tokens": 64
        }' | {{.PYTHON}} -m json.tool

profile:run:
  # Convenience wrapper: runs the four sub-tasks strictly in order so the trace
  # window brackets exactly one request, then lists what was written.
  desc: "One-shot profiling: start → one VLM request → stop (captures one request end-to-end)"
  cmds:
    - task: "profile:start"
    - task: "profile:request"
    - task: "profile:stop"
    - task: "traces:ls"

# ── Stress test ───────────────────────────────────────────────────────

stress:
  # Drives stress_test_render.py against the local server in chat mode.
  # N = value passed to --n, C = value passed to --concurrency.
  desc: "Stress test the render endpoint. Override: N=50 C=8 task stress"
  vars:
    N: '{{.N | default "20"}}'
    C: '{{.C | default "4"}}'
  cmds:
    # Folded scalar: joined into a single command line.
    - >-
      {{.PYTHON}} stress_test_render.py
      --url http://localhost:{{.PORT}}
      --n {{.N}}
      --concurrency {{.C}}
      --mode chat

# ── Traces ────────────────────────────────────────────────────────────

traces:pull:
  # Finds the newest frontend and rank0 traces inside the pod (by mtime) and
  # copies them to fixed local names so traces:summary can pick them up.
  desc: "Copy latest traces from pod → LOCAL_TRACE_DIR (default ./traces)"
  vars:
    POD: '{{.POD | default "vllm-dev"}}'
  cmds:
    - mkdir -p {{.LOCAL_TRACE_DIR}}
    - |
      # Resolve the most recent file of each kind on the pod side.
      latest_frontend=$(oc exec {{.POD}} -- bash -c "ls -t {{.TRACE_DIR}}/*.async_llm.*.json.gz 2>/dev/null | head -1")
      latest_gpu=$(oc exec {{.POD}} -- bash -c "ls -t {{.TRACE_DIR}}/rank0.*.json.gz 2>/dev/null | head -1")

      # Bail out before copying anything if either trace is missing.
      if [ -z "$latest_frontend" ]; then
        echo "No frontend trace found"
        exit 1
      fi
      if [ -z "$latest_gpu" ]; then
        echo "No GPU trace found"
        exit 1
      fi

      oc cp {{.POD}}:$latest_frontend {{.LOCAL_TRACE_DIR}}/frontend.json.gz
      oc cp {{.POD}}:$latest_gpu {{.LOCAL_TRACE_DIR}}/rank0.json.gz
      echo "Pulled to {{.LOCAL_TRACE_DIR}}/"
      ls -lh {{.LOCAL_TRACE_DIR}}/*.json.gz

traces:summary:
  # Runs tools/summary_trace.py with three positional paths: the frontend trace,
  # the rank0 trace, and the output file (all under LOCAL_TRACE_DIR).
  desc: "Build summary.json.gz from frontend + GPU traces (LOCAL_TRACE_DIR= to override)"
  cmds:
    # Folded scalar: joined into a single command line.
    - >-
      {{.PYTHON}} tools/summary_trace.py
      {{.LOCAL_TRACE_DIR}}/frontend.json.gz
      {{.LOCAL_TRACE_DIR}}/rank0.json.gz
      {{.LOCAL_TRACE_DIR}}/summary.json.gz

traces:ls:
  # Lists trace files newest-first in two groups. A group with no matching file
  # prints "(none)" because `ls` exits non-zero on an unmatched glob.
  desc: List trace files in TRACE_DIR, split by frontend vs GPU worker
  cmds:
    - |
      echo "=== Traces in {{.TRACE_DIR}} ==="
      echo ""
      echo "Frontend (event-loop: url_download, mm_processor, prefill/decode labels):"
      # `*async_llm.*` matches both a bare "async_llm.<...>" name and the
      # prefixed "<prefix>.async_llm.<...>" form that traces:pull searches for.
      ls -lht {{.TRACE_DIR}}/*async_llm.*.json.gz 2>/dev/null || echo " (none)"
      echo ""
      echo "GPU worker (mm_encoder:forward, execute_model, CUDA kernels):"
      ls -lht {{.TRACE_DIR}}/rank*.*.json.gz 2>/dev/null || echo " (none)"
  silent: true

traces:open:
  # Opens the Perfetto web UI and prints the newest frontend and rank0 trace
  # paths (by mtime) so they can be dragged into the page.
  desc: Open Perfetto UI and print the latest frontend + GPU trace paths to upload
  cmds:
    - open https://ui.perfetto.dev
    - |
      # Capture the newest path of each kind first. A pipeline's exit status is
      # that of `head`, so `ls ... | head -1 || echo "(none)"` can never print
      # the fallback; an empty-variable default is used instead.
      # `*async_llm.*` matches both bare and "<prefix>.async_llm.<...>" names,
      # the latter being the form traces:pull searches for.
      frontend=$(ls -t {{.TRACE_DIR}}/*async_llm.*.json.gz 2>/dev/null | head -1)
      gpu=$(ls -t {{.TRACE_DIR}}/rank0.*.json.gz 2>/dev/null | head -1)
      echo ""
      echo "Drag & drop both files into Perfetto (or File → Open Trace File):"
      echo ""
      printf " Frontend : "
      echo "${frontend:-(none)}"
      printf " GPU : "
      echo "${gpu:-(none)}"
      echo ""
      echo "Both traces share wall-clock time and will align on the same timeline."
  silent: true

traces:unzip:
  # Decompresses every *.json.gz in TRACE_DIR next to the original (gunzip -k
  # keeps the .gz). Files whose decompressed twin already exists are skipped.
  desc: Decompress all .gz traces (needed by tools that require raw JSON)
  cmds:
    - |
      found=0
      for f in {{.TRACE_DIR}}/*.json.gz; do
        # An unmatched glob is passed through literally; skip it.
        [ -f "$f" ] || continue
        found=1
        out="${f%.gz}"
        if [ ! -f "$out" ]; then
          gunzip -k "$f" && echo "Decompressed: $out"
        else
          echo "Already exists: $out"
        fi
      done
      # Use `if` rather than `[ ... ] && echo`: as the last command, the `&&`
      # form returns 1 whenever traces WERE found and makes the task fail.
      if [ "$found" -eq 0 ]; then
        echo "No .gz traces found in {{.TRACE_DIR}}"
      fi

traces:clean:
  # Destructive: Task asks the `prompt` question before running the commands.
  # Removes both compressed traces and any decompressed .json files.
  desc: Remove all traces from TRACE_DIR
  prompt: "Delete all traces in {{.TRACE_DIR}}?"
  cmds:
    - rm -f {{.TRACE_DIR}}/*.json {{.TRACE_DIR}}/*.json.gz
    - echo "Cleaned {{.TRACE_DIR}}"

# ── Experiments ───────────────────────────────────────────────────────
# Each experiment restarts the server to ensure a cold cache, then captures
# one profiled request and saves traces to ./traces/exp<N>_<size>_<delivery>/.
#
# Experiments:
# 1. small image — URL
# 2. hi-res image — URL
# 3. small image — base64
# 4. hi-res image — base64

_exp:restart-server:
  # Internal helper: kills any running vLLM in the pod, wipes old traces, starts
  # a fresh profiled server in the background, then polls /v1/models until it
  # answers or READY_TIMEOUT seconds have passed.
  internal: true
  desc: Kill + restart vllm in pod with profiling, wait until ready
  vars:
    POD: '{{.POD | default "vllm-dev"}}'
    # Upper bound (seconds) on the readiness wait; without one a server that
    # fails to start would make this task hang forever.
    READY_TIMEOUT: '{{.READY_TIMEOUT | default "1800"}}'
  cmds:
    # Step 1a: kill old server.
    # Run pkill directly (no bash -c wrapper) so the search pattern does not
    # appear in any wrapper process's cmdline. pkill always excludes itself.
    - oc exec {{.POD}} -- pkill -f /workspace/vllm/.venv/bin/vllm || true
    # Step 1b: wipe old traces and launch new server (no sleep, no wait).
    # $HF_TOKEN is NOT escaped, so it is expanded by the local shell and its
    # value is embedded in the remote command; \$! is escaped and therefore
    # evaluated inside the pod.
    # NOTE(review): the server log goes to /tmp/vllm-serve.log, which differs
    # from LOG_FILE (default /tmp/vllm-server.log) — confirm this is intended.
    - |
      oc exec {{.POD}} -- bash -c "
        rm -f {{.TRACE_DIR}}/*.json.gz {{.TRACE_DIR}}/*.txt
        nohup env VLLM_CUSTOM_SCOPES_FOR_PROFILING=1 \
          HF_TOKEN='$HF_TOKEN' HUGGING_FACE_HUB_TOKEN='$HF_TOKEN' \
          /workspace/vllm/.venv/bin/vllm serve {{.MODEL}} \
          --port {{.PORT}} \
          --profiler-config '{\"profiler\":\"torch\",\"torch_profiler_dir\":\"{{.TRACE_DIR}}\",\"torch_profiler_with_stack\":{{.WITH_STACK}},\"max_iterations\":1}' \
          > /tmp/vllm-serve.log 2>&1 &
        echo \"Server PID \$!\""
    # Step 2: poll locally so oc exec never sits open for 2+ minutes
    - |
      echo "Waiting for server on {{.POD}}:{{.PORT}}..."
      deadline=$(( $(date +%s) + {{.READY_TIMEOUT}} ))
      until oc exec {{.POD}} -- curl -sf http://localhost:{{.PORT}}/v1/models > /dev/null 2>&1; do
        if [ "$(date +%s)" -ge "$deadline" ]; then
          echo "Server not ready after {{.READY_TIMEOUT}}s; check /tmp/vllm-serve.log in pod {{.POD}}" >&2
          exit 1
        fi
        sleep 10
      done
      echo ready

_exp:capture:
  # Internal helper: inside the pod, opens a profile window, sends one chat
  # request carrying IMAGE_URL, and closes the window; then pulls the traces to
  # OUTDIR and builds the summary there.
  internal: true
  desc: "Profile one URL-based request. Vars: IMAGE_URL, OUTDIR"
  vars:
    POD: '{{.POD | default "vllm-dev"}}'
  cmds:
    # Error handling in the remote script:
    #  - `set -e` aborts if start_profile (or stop_profile) fails;
    #  - the request's exit code is saved in rc so stop_profile ALWAYS runs,
    #    then the script exits with rc so a failed request fails this task
    #    instead of being masked by stop_profile's success.
    # \$ is escaped so those expansions happen in the pod, not locally.
    - |
      oc exec {{.POD}} -- bash -c "
        set -e
        curl -fsS -X POST http://localhost:{{.PORT}}/start_profile
        rc=0
        curl -sf http://localhost:{{.PORT}}/v1/chat/completions \
          -H 'Content-Type: application/json' \
          -d '{\"model\":\"{{.MODEL}}\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"image_url\",\"image_url\":{\"url\":\"{{.IMAGE_URL}}\"}},{\"type\":\"text\",\"text\":\"Describe this image in one sentence.\"}]}],\"max_tokens\":64}' \
          > /dev/null || rc=\$?
        curl -fsS -X POST http://localhost:{{.PORT}}/stop_profile
        exit \$rc"
    - task: traces:pull
      vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" }
    - task: traces:summary
      vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" }

_exp:capture-base64:
  internal: true
  # Use oc port-forward so capture_request.py runs locally with --base64,
  # downloading the image locally and sending the encoded payload to the pod.
  # This avoids heredoc-in-YAML issues and keeps Python code out of the Taskfile.
  desc: "Profile one base64 request. Vars: IMAGE_URL, OUTDIR"
  vars:
    POD: '{{.POD | default "vllm-dev"}}'
    # Local end of the port-forward; override if 18000 is already in use.
    LOCAL_PORT: '{{.LOCAL_PORT | default "18000"}}'
  cmds:
    - |
      oc port-forward {{.POD}} {{.LOCAL_PORT}}:{{.PORT}} &
      PFW=$!
      trap "kill $PFW 2>/dev/null" EXIT

      # Poll the forwarded port (up to ~15s) instead of a fixed sleep, so a slow
      # tunnel does not make the capture script hit a closed port.
      attempts=0
      until curl -sf http://localhost:{{.LOCAL_PORT}}/v1/models > /dev/null 2>&1; do
        attempts=$((attempts + 1))
        if [ "$attempts" -ge 15 ]; then
          echo "port-forward to {{.POD}} not ready on localhost:{{.LOCAL_PORT}}" >&2
          kill $PFW 2>/dev/null || true
          exit 1
        fi
        sleep 1
      done

      # Remember the capture's exit code: the cleanup `kill ... || true` below
      # always succeeds and would otherwise hide a failed capture.
      rc=0
      {{.PYTHON}} tools/capture_request.py \
        --endpoint http://localhost:{{.LOCAL_PORT}} \
        --model {{.MODEL}} \
        --image {{.IMAGE_URL}} \
        --base64 || rc=$?
      kill $PFW 2>/dev/null || true
      exit $rc
    - task: traces:pull
      vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" }
    - task: traces:summary
      vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" }

exp:1:
  # Restart the server, then profile one request that passes the small image by URL.
  desc: "Experiment 1 — small image, URL delivery"
  cmds:
    - task: "_exp:restart-server"
    - task: "_exp:capture"
      vars: { IMAGE_URL: "{{.IMAGE_URL}}", OUTDIR: "./traces/exp1_small_url" }

exp:2:
  # Restart the server, then profile one request that passes the hi-res image by URL.
  desc: "Experiment 2 — hi-res image, URL delivery"
  cmds:
    - task: "_exp:restart-server"
    - task: "_exp:capture"
      vars: { IMAGE_URL: "{{.IMAGE_URL_HIRES}}", OUTDIR: "./traces/exp2_hires_url" }

exp:3:
  # Restart the server, then profile one request that sends the small image as base64.
  desc: "Experiment 3 — small image, base64 delivery"
  cmds:
    - task: "_exp:restart-server"
    - task: "_exp:capture-base64"
      vars: { IMAGE_URL: "{{.IMAGE_URL}}", OUTDIR: "./traces/exp3_small_base64" }

exp:4:
  # Restart the server, then profile one request that sends the hi-res image as base64.
  desc: "Experiment 4 — hi-res image, base64 delivery"
  cmds:
    - task: "_exp:restart-server"
    - task: "_exp:capture-base64"
      vars: { IMAGE_URL: "{{.IMAGE_URL_HIRES}}", OUTDIR: "./traces/exp4_hires_base64" }

exp:all:
  # Runs exp:1 through exp:4 one after another; each one restarts the server itself.
  desc: "Run all 4 experiments sequentially (server restart between each)"
  cmds:
    - task: "exp:1"
    - task: "exp:2"
    - task: "exp:3"
    - task: "exp:4"

# ── Logs ──────────────────────────────────────────────────────────────

logs:
  # Follows LOG_FILE until interrupted (tail -f never exits on its own).
  desc: "Tail the server log (task serve tees output to {{.LOG_FILE}})"
  cmds:
    - >-
      tail -f {{.LOG_FILE}}

logs:grep:
  # Case-insensitive search of LOG_FILE, showing at most the last 100 matches.
  desc: "Search server log for keyword: PATTERN=mm_processor task logs:grep"
  # An unset PATTERN renders as an empty string, which grep treats as
  # "match every line"; refuse to run instead of dumping the log tail.
  preconditions:
    - sh: '[ -n "{{.PATTERN}}" ]'
      msg: "PATTERN is required, e.g. PATTERN=mm_processor task logs:grep"
  cmds:
    # -e keeps a pattern that starts with "-" from being parsed as an option.
    - grep -i -e "{{.PATTERN}}" {{.LOG_FILE}} | tail -100
Loading
Loading