sagearc · sagearc · Apr 13, 2026 · Apr 13, 2026 · Apr 13, 2026 · Apr 13, 2026
diff --git a/Taskfile.yml b/Taskfile.yml
@@ -0,0 +1,314 @@
+# https://taskfile.dev
+version: '3'
+
+vars:
+  # Every templated entry keeps a value supplied by the caller and otherwise
+  # falls back to the quoted default.
+  # Model name passed to `vllm serve` and placed in the chat request payloads.
+  MODEL: '{{.MODEL | default "google/gemma-3-27b-it"}}'
+  # Port the server is started on and that the curl-based tasks target.
+  PORT: '{{.PORT | default "8000"}}'
+  # Directory handed to the profiler as torch_profiler_dir; the traces:* tasks read it.
+  TRACE_DIR: '{{.TRACE_DIR | default "/tmp/vllm-traces"}}'
+  # Destination of traces:pull and input directory of traces:summary.
+  LOCAL_TRACE_DIR: '{{.LOCAL_TRACE_DIR | default "./traces"}}'
+  # Interpreter used for the helper scripts and for json.tool.
+  PYTHON: .venv/bin/python
+  # File that serve / serve:plain tee the server output into.
+  LOG_FILE: /tmp/vllm-server.log
+  # false = only custom spans visible; true = full Python stacks (needed for flamegraph)
+  WITH_STACK: '{{.WITH_STACK | default "true"}}'
+  # Stable public image used for ad-hoc profiling requests
+  IMAGE_URL: '{{.IMAGE_URL | default "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"}}'
+  # High-res image requested at 3000x2000 (~6 MP)
+  IMAGE_URL_HIRES: '{{.IMAGE_URL_HIRES | default "https://picsum.photos/seed/vllm/3000/2000.jpg"}}'
+
+tasks:
+  default:
+    # Running a bare `task` prints the task list; the command itself is not echoed.
+    desc: List all tasks
+    silent: true
+    cmds:
+      - 'task --list'
+
+  # ── Server ────────────────────────────────────────────────────────────
+
+  serve:
+    # Runs the profiled server in the foreground. TRACE_DIR is created first
+    # because it is handed to the profiler as torch_profiler_dir.
+    desc: "Start vLLM with torch profiler + custom record_function scopes. Logs tee → {{.LOG_FILE}}"
+    cmds:
+      - mkdir -p "{{.TRACE_DIR}}"
+      - |
+        # Without pipefail the pipeline reports tee's exit status, so a server
+        # that crashes or fails to start would look like a successful task.
+        set -o pipefail
+        VLLM_CUSTOM_SCOPES_FOR_PROFILING=1 \
+        .venv/bin/vllm serve {{.MODEL}} \
+          --port {{.PORT}} \
+          --profiler-config '{"profiler":"torch","torch_profiler_dir":"{{.TRACE_DIR}}","torch_profiler_with_stack":{{.WITH_STACK}},"max_iterations":1}' \
+          2>&1 | tee {{.LOG_FILE}}
+
+  serve:plain:
+    # Same server command as `serve`, minus the profiler flags and env var.
+    desc: Start vLLM without profiler (baseline / smoke test)
+    cmds:
+      - |
+        # Without pipefail the pipeline reports tee's exit status, so a server
+        # that crashes or fails to start would look like a successful task.
+        set -o pipefail
+        .venv/bin/vllm serve {{.MODEL}} --port {{.PORT}} 2>&1 | tee {{.LOG_FILE}}
+
+  # ── Profiler control ──────────────────────────────────────────────────
+
+  profile:start:
+    # The confirmation is printed only when the POST succeeds (-f turns HTTP
+    # errors into a non-zero exit, which also fails the task).
+    desc: POST /start_profile — open a trace window
+    cmds:
+      - |
+        curl -fsS -X POST http://localhost:{{.PORT}}/start_profile \
+          && echo " profiler started"
+
+  profile:stop:
+    # The confirmation is printed only when the POST succeeds (-f turns HTTP
+    # errors into a non-zero exit, which also fails the task).
+    desc: POST /stop_profile — flush trace files to {{.TRACE_DIR}}
+    cmds:
+      - |
+        curl -fsS -X POST http://localhost:{{.PORT}}/stop_profile \
+          && echo " profiler stopped"
+
+  profile:request:
+    # Sends a single chat completion (image + short text prompt) and
+    # pretty-prints the JSON response.
+    desc: "Send one VLM chat request with an image URL (IMAGE_URL= to override)"
+    cmds:
+      - |
+        # -S keeps curl's own error message visible (e.g. connection refused);
+        # with plain -s the only symptom was a JSON decode error from json.tool.
+        # -f is deliberately not used so API error bodies are still printed.
+        curl -sS http://localhost:{{.PORT}}/v1/chat/completions \
+          -H "Content-Type: application/json" \
+          -d '{
+            "model": "{{.MODEL}}",
+            "messages": [{
+              "role": "user",
+              "content": [
+                {"type": "image_url", "image_url": {"url": "{{.IMAGE_URL}}"}},
+                {"type": "text", "text": "Describe this image in one sentence."}
+              ]
+            }],
+            "max_tokens": 64
+          }' | {{.PYTHON}} -m json.tool
+
+  profile:run:
+    # Sub-tasks run in list order: open the trace window, send one request,
+    # close the window, then list what was written.
+    desc: "One-shot profiling: start → one VLM request → stop (captures one request end-to-end)"
+    cmds:
+      - { task: "profile:start" }
+      - { task: "profile:request" }
+      - { task: "profile:stop" }
+      - { task: "traces:ls" }
+
+  # ── Stress test ───────────────────────────────────────────────────────
+
+  stress:
+    # N and C are forwarded to the script as --n and --concurrency.
+    desc: "Stress test the render endpoint. Override: N=50 C=8 task stress"
+    vars:
+      N: '{{.N | default "20"}}'
+      C: '{{.C | default "4"}}'
+    cmds:
+      # Folded scalar: the lines below are joined into one command line.
+      - >-
+        {{.PYTHON}} stress_test_render.py
+        --url http://localhost:{{.PORT}}
+        --n {{.N}}
+        --concurrency {{.C}}
+        --mode chat
+
+  # ── Traces ────────────────────────────────────────────────────────────
+
+  traces:pull:
+    # Finds the newest frontend and rank0 trace inside the pod and copies them
+    # to fixed local names (frontend.json.gz / rank0.json.gz), which is what
+    # traces:summary reads.
+    desc: "Copy latest traces from pod → LOCAL_TRACE_DIR (default ./traces)"
+    vars:
+      POD: '{{.POD | default "vllm-dev"}}'
+    cmds:
+      - mkdir -p "{{.LOCAL_TRACE_DIR}}"
+      - |
+        # Stop at the first failing oc call; previously a failed `oc cp` was
+        # ignored and the task still reported "Pulled".
+        set -e
+        # The frontend glob accepts names with or without a prefix before
+        # "async_llm." so it agrees with traces:ls and traces:open.
+        frontend=$(oc exec {{.POD}} -- bash -c "ls -t {{.TRACE_DIR}}/*async_llm.*.json.gz 2>/dev/null | head -1")
+        gpu=$(oc exec {{.POD}} -- bash -c "ls -t {{.TRACE_DIR}}/rank0.*.json.gz 2>/dev/null | head -1")
+        if [ -z "$frontend" ]; then
+          echo "No frontend trace found" >&2
+          exit 1
+        fi
+        if [ -z "$gpu" ]; then
+          echo "No GPU trace found" >&2
+          exit 1
+        fi
+        oc cp "{{.POD}}:$frontend" "{{.LOCAL_TRACE_DIR}}/frontend.json.gz"
+        oc cp "{{.POD}}:$gpu" "{{.LOCAL_TRACE_DIR}}/rank0.json.gz"
+        echo "Pulled to {{.LOCAL_TRACE_DIR}}/"
+        ls -lh "{{.LOCAL_TRACE_DIR}}"/*.json.gz
+
+  traces:summary:
+    # Argument order passed to the script: frontend trace, rank0 trace, output file.
+    desc: "Build summary.json.gz from frontend + GPU traces (LOCAL_TRACE_DIR= to override)"
+    cmds:
+      # Folded scalar: the lines below are joined into one command line.
+      - >-
+        {{.PYTHON}} tools/summary_trace.py
+        {{.LOCAL_TRACE_DIR}}/frontend.json.gz
+        {{.LOCAL_TRACE_DIR}}/rank0.json.gz
+        {{.LOCAL_TRACE_DIR}}/summary.json.gz
+
+  traces:ls:
+    # Lists newest-first; prints "(none)" for a group when ls finds no match.
+    desc: List trace files in TRACE_DIR, split by frontend vs GPU worker
+    cmds:
+      - |
+        echo "=== Traces in {{.TRACE_DIR}} ==="
+        echo ""
+        echo "Frontend (event-loop: url_download, mm_processor, prefill/decode labels):"
+        # Leading * so names with a prefix before "async_llm." are listed too;
+        # traces:pull looks for "*.async_llm.*", which the old glob here missed.
+        ls -lht {{.TRACE_DIR}}/*async_llm.*.json.gz 2>/dev/null || echo "  (none)"
+        echo ""
+        echo "GPU worker (mm_encoder:forward, execute_model, CUDA kernels):"
+        ls -lht {{.TRACE_DIR}}/rank*.*.json.gz 2>/dev/null || echo "  (none)"
+    silent: true
+
+  traces:open:
+    # Opens the Perfetto UI in a browser and prints the newest frontend and
+    # rank0 trace paths so they can be dragged into it.
+    desc: Open Perfetto UI and print the latest frontend + GPU trace paths to upload
+    cmds:
+      # `open` exists on macOS only; fall back to xdg-open, then to a hint, so
+      # the trace paths below are still printed on other systems.
+      - |
+        open https://ui.perfetto.dev 2>/dev/null \
+          || xdg-open https://ui.perfetto.dev 2>/dev/null \
+          || echo "Open https://ui.perfetto.dev in your browser"
+      - |
+        # Capture the paths first: in `ls ... | head -1 || echo "(none)"` the
+        # pipeline status is head's (always 0), so the fallback never printed.
+        frontend=$(ls -t {{.TRACE_DIR}}/*async_llm.*.json.gz 2>/dev/null | head -1)
+        gpu=$(ls -t {{.TRACE_DIR}}/rank0.*.json.gz 2>/dev/null | head -1)
+        [ -n "$frontend" ] || frontend="(none)"
+        [ -n "$gpu" ] || gpu="(none)"
+        echo ""
+        echo "Drag & drop both files into Perfetto (or File → Open Trace File):"
+        echo ""
+        echo "  Frontend : $frontend"
+        echo "  GPU      : $gpu"
+        echo ""
+        echo "Both traces share wall-clock time and will align on the same timeline."
+    silent: true
+
+  traces:unzip:
+    # Decompresses each *.json.gz next to the original (gunzip -k keeps the .gz)
+    # and skips files whose .json already exists.
+    desc: Decompress all .gz traces (needed by tools that require raw JSON)
+    cmds:
+      - |
+        found=0
+        failed=0
+        for f in {{.TRACE_DIR}}/*.json.gz; do
+          # An unmatched glob stays literal; skip it.
+          [ -f "$f" ] || continue
+          found=1
+          out="${f%.gz}"
+          if [ -f "$out" ]; then
+            echo "Already exists: $out"
+          elif gunzip -k "$f"; then
+            echo "Decompressed: $out"
+          else
+            failed=1
+          fi
+        done
+        # Use `if` rather than `[ ... ] && echo`: as the last command, that
+        # form made the task exit 1 whenever traces WERE found.
+        if [ "$found" -eq 0 ]; then
+          echo "No .gz traces found in {{.TRACE_DIR}}"
+        fi
+        # Non-zero only when a gunzip call actually failed.
+        exit "$failed"
+
+  traces:clean:
+    # Asks for confirmation first, then removes compressed and decompressed
+    # traces (*.json.gz and *.json) from TRACE_DIR.
+    prompt: "Delete all traces in {{.TRACE_DIR}}?"
+    desc: Remove all traces from TRACE_DIR
+    cmds:
+      - 'rm -f {{.TRACE_DIR}}/*.json.gz {{.TRACE_DIR}}/*.json'
+      - 'echo "Cleaned {{.TRACE_DIR}}"'
+
+  # ── Experiments ───────────────────────────────────────────────────────
+  # Each experiment restarts the server to ensure a cold cache, then captures
+  # one profiled request and saves traces to traces/expN_<size>_<delivery>/.
+  #
+  # Experiments:
+  #   1. small image  — URL
+  #   2. hi-res image — URL
+  #   3. small image  — base64
+  #   4. hi-res image — base64
+
+  _exp:restart-server:
+    internal: true
+    # Stops any running server in the pod, clears old traces, launches a new
+    # profiled server in the background and blocks until /v1/models answers.
+    # READY_TIMEOUT (seconds, default 1800) bounds the readiness wait.
+    desc: Kill + restart vllm in pod with profiling, wait until ready
+    vars:
+      POD: '{{.POD | default "vllm-dev"}}'
+      READY_TIMEOUT: '{{.READY_TIMEOUT | default "1800"}}'
+    cmds:
+      # Step 1a: kill old server.
+      # Run pkill directly (no bash -c wrapper) so the search pattern does not
+      # appear in any wrapper process's cmdline. pkill always excludes itself.
+      - oc exec {{.POD}} -- pkill -f /workspace/vllm/.venv/bin/vllm || true
+      # Step 1b: wait for the old process to exit before starting a new one, so
+      # the new server does not race it for the port. pgrep is run directly for
+      # the same reason as pkill above. If pgrep is unavailable the loop ends
+      # immediately, which matches the previous behaviour.
+      - |
+        tries=0
+        while oc exec {{.POD}} -- pgrep -f /workspace/vllm/.venv/bin/vllm > /dev/null 2>&1; do
+          tries=$((tries + 1))
+          if [ "$tries" -ge 30 ]; then
+            echo "Old vLLM server in {{.POD}} did not exit within 60s" >&2
+            exit 1
+          fi
+          sleep 2
+        done
+      # Step 1c: wipe old traces and launch new server (no sleep, no wait)
+      - |
+        # Forward the token only when it is set locally; passing an empty
+        # HF_TOKEN would override a token already configured in the pod.
+        hf_env=""
+        if [ -n "${HF_TOKEN:-}" ]; then
+          hf_env="HF_TOKEN='$HF_TOKEN' HUGGING_FACE_HUB_TOKEN='$HF_TOKEN'"
+        fi
+        # Output goes to LOG_FILE, the same path `serve` tees to.
+        oc exec {{.POD}} -- bash -c "
+          mkdir -p {{.TRACE_DIR}}
+          rm -f {{.TRACE_DIR}}/*.json.gz {{.TRACE_DIR}}/*.txt
+          nohup env VLLM_CUSTOM_SCOPES_FOR_PROFILING=1 $hf_env \
+            /workspace/vllm/.venv/bin/vllm serve {{.MODEL}} \
+            --port {{.PORT}} \
+            --profiler-config '{\"profiler\":\"torch\",\"torch_profiler_dir\":\"{{.TRACE_DIR}}\",\"torch_profiler_with_stack\":{{.WITH_STACK}},\"max_iterations\":1}' \
+            > {{.LOG_FILE}} 2>&1 &
+          echo \"Server PID \$!\""
+      # Step 2: poll locally so oc exec never sits open for 2+ minutes
+      - |
+        echo "Waiting for server on {{.POD}}:{{.PORT}}..."
+        waited=0
+        until oc exec {{.POD}} -- curl -sf http://localhost:{{.PORT}}/v1/models > /dev/null 2>&1; do
+          # Give up eventually: a server that crashed on startup would
+          # otherwise keep this loop running forever.
+          if [ "$waited" -ge {{.READY_TIMEOUT}} ]; then
+            echo "Server not ready after {{.READY_TIMEOUT}}s; check {{.LOG_FILE}} in {{.POD}}" >&2
+            exit 1
+          fi
+          sleep 10
+          waited=$((waited + 10))
+        done
+        echo ready
+
+  _exp:capture:
+    internal: true
+    # Runs start_profile → one chat request → stop_profile inside the pod, then
+    # pulls and summarises the traces into OUTDIR.
+    desc: "Profile one URL-based request. Vars: IMAGE_URL, OUTDIR"
+    vars:
+      POD: '{{.POD | default "vllm-dev"}}'
+    cmds:
+      - |
+        # set -e aborts if the profiler cannot be started. The request's status
+        # is kept in rc so stop_profile still runs, and is then returned so a
+        # failed request fails the task instead of pulling stale traces.
+        # \$ keeps the variable for the remote shell rather than the local one.
+        oc exec {{.POD}} -- bash -c "
+          set -e
+          curl -fsS -X POST http://localhost:{{.PORT}}/start_profile
+          rc=0
+          curl -fsS http://localhost:{{.PORT}}/v1/chat/completions \
+            -H 'Content-Type: application/json' \
+            -d '{\"model\":\"{{.MODEL}}\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"image_url\",\"image_url\":{\"url\":\"{{.IMAGE_URL}}\"}},{\"type\":\"text\",\"text\":\"Describe this image in one sentence.\"}]}],\"max_tokens\":64}' \
+            > /dev/null || rc=\$?
+          curl -fsS -X POST http://localhost:{{.PORT}}/stop_profile
+          exit \$rc"
+      - task: traces:pull
+        vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" }
+      - task: traces:summary
+        vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" }
+
+  _exp:capture-base64:
+    internal: true
+    # Use oc port-forward so capture_request.py runs locally with --base64,
+    # downloading the image locally and sending the encoded payload to the pod.
+    # This avoids heredoc-in-YAML issues and keeps Python code out of the Taskfile.
+    desc: "Profile one base64 request. Vars: IMAGE_URL, OUTDIR"
+    vars:
+      POD: '{{.POD | default "vllm-dev"}}'
+      LOCAL_PORT: "18000"
+    cmds:
+      - |
+        oc port-forward {{.POD}} {{.LOCAL_PORT}}:{{.PORT}} &
+        PFW=$!
+        # The trap also covers the early exits below.
+        trap "kill $PFW 2>/dev/null" EXIT
+        # Poll until the tunnel answers instead of relying on a fixed sleep.
+        tries=0
+        until curl -sf http://localhost:{{.LOCAL_PORT}}/v1/models > /dev/null 2>&1; do
+          tries=$((tries + 1))
+          if [ "$tries" -ge 30 ]; then
+            echo "port-forward to {{.POD}} not ready after 30s" >&2
+            exit 1
+          fi
+          sleep 1
+        done
+        # Keep the script's status: previously the trailing `kill ... || true`
+        # was the last command, so a failed capture still looked successful
+        # and stale traces were pulled.
+        status=0
+        {{.PYTHON}} tools/capture_request.py \
+          --endpoint http://localhost:{{.LOCAL_PORT}} \
+          --model    "{{.MODEL}}" \
+          --image    "{{.IMAGE_URL}}" \
+          --base64 || status=$?
+        kill $PFW 2>/dev/null || true
+        exit $status
+      - task: traces:pull
+        vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" }
+      - task: traces:summary
+        vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" }
+
+  exp:1:
+    # Restart the server, then profile one request for the small image sent by URL.
+    desc: "Experiment 1 — small image, URL delivery"
+    cmds:
+      - { task: "_exp:restart-server" }
+      - task: "_exp:capture"
+        vars: { IMAGE_URL: "{{.IMAGE_URL}}", OUTDIR: "./traces/exp1_small_url" }
+
+  exp:2:
+    # Restart the server, then profile one request for the hi-res image sent by URL.
+    desc: "Experiment 2 — hi-res image, URL delivery"
+    cmds:
+      - { task: "_exp:restart-server" }
+      - task: "_exp:capture"
+        vars: { IMAGE_URL: "{{.IMAGE_URL_HIRES}}", OUTDIR: "./traces/exp2_hires_url" }
+
+  exp:3:
+    # Restart the server, then profile one request for the small image sent as base64.
+    desc: "Experiment 3 — small image, base64 delivery"
+    cmds:
+      - { task: "_exp:restart-server" }
+      - task: "_exp:capture-base64"
+        vars: { IMAGE_URL: "{{.IMAGE_URL}}", OUTDIR: "./traces/exp3_small_base64" }
+
+  exp:4:
+    # Restart the server, then profile one request for the hi-res image sent as base64.
+    desc: "Experiment 4 — hi-res image, base64 delivery"
+    cmds:
+      - { task: "_exp:restart-server" }
+      - task: "_exp:capture-base64"
+        vars: { IMAGE_URL: "{{.IMAGE_URL_HIRES}}", OUTDIR: "./traces/exp4_hires_base64" }
+
+  exp:all:
+    # Runs the four experiments in order; each one restarts the server itself.
+    desc: "Run all 4 experiments sequentially (server restart between each)"
+    cmds:
+      - { task: "exp:1" }
+      - { task: "exp:2" }
+      - { task: "exp:3" }
+      - { task: "exp:4" }
+
+  # ── Logs ──────────────────────────────────────────────────────────────
+
+  logs:
+    # Follows LOG_FILE until interrupted.
+    desc: "Tail the server log (task serve tees output to {{.LOG_FILE}})"
+    cmds:
+      - 'tail -f {{.LOG_FILE}}'
+
+  logs:grep:
+    # Case-insensitive search of LOG_FILE, limited to the last 100 matches.
+    desc: "Search server log for keyword: PATTERN=mm_processor task logs:grep"
+    # An empty PATTERN would make grep match every line, so require it.
+    preconditions:
+      - sh: '[ -n "{{.PATTERN}}" ]'
+        msg: "PATTERN is required, e.g. PATTERN=mm_processor task logs:grep"
+    cmds:
+      # `--` stops option parsing so a pattern starting with "-" is not read as a flag.
+      - grep -i -- "{{.PATTERN}}" {{.LOG_FILE}} | tail -100