From 8fc2a7859b6e55682fecde5aff5f76354d45051f Mon Sep 17 00:00:00 2001
From: sagearc <sagiahrak@gmail.com>
Date: Mon, 13 Apr 2026 12:18:26 +0000
Subject: [PATCH 1/9] profile encoder

---
 vllm/multimodal/media/connector.py    | 23 +++++++++++++----------
 vllm/multimodal/processing/context.py | 24 +++++++++++++-----------
 vllm/v1/engine/async_llm.py           | 15 +++++++++------
 vllm/v1/worker/gpu_model_runner.py    |  4 ++--
 4 files changed, 37 insertions(+), 29 deletions(-)

diff --git a/vllm/multimodal/media/connector.py b/vllm/multimodal/media/connector.py
index 6ec90a45f1d0..7673497ba31b 100644
--- a/vllm/multimodal/media/connector.py
+++ b/vllm/multimodal/media/connector.py
@@ -23,6 +23,7 @@
 from vllm.connections import HTTPConnection, global_http_connection
 from vllm.logger import init_logger
 from vllm.utils.registry import ExtensionManager
+from vllm.v1.utils import record_function_or_nullcontext
 
 from .audio import AudioEmbeddingMediaIO, AudioMediaIO
 from .base import MediaIO
@@ -300,11 +301,12 @@ def load_from_url(
                 return media_io.load_bytes(cached)
 
             connection = self.connection
-            data = connection.get_bytes(
-                url_spec.url,
-                timeout=fetch_timeout,
-                allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
-            )
+            with record_function_or_nullcontext("media: url_download"):
+                data = connection.get_bytes(
+                    url_spec.url,
+                    timeout=fetch_timeout,
+                    allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
+                )
 
             self._put_cached_bytes(url, data)
             return media_io.load_bytes(data)
@@ -341,11 +343,12 @@ async def load_from_url_async(
                 return await future
 
             connection = self.connection
-            data = await connection.async_get_bytes(
-                url_spec.url,
-                timeout=fetch_timeout,
-                allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
-            )
+            with record_function_or_nullcontext("media: url_download"):
+                data = await connection.async_get_bytes(
+                    url_spec.url,
+                    timeout=fetch_timeout,
+                    allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
+                )
 
             await loop.run_in_executor(
                 global_thread_pool, self._put_cached_bytes, url, data
diff --git a/vllm/multimodal/processing/context.py b/vllm/multimodal/processing/context.py
index ef9710374d81..068c1ea19e69 100644
--- a/vllm/multimodal/processing/context.py
+++ b/vllm/multimodal/processing/context.py
@@ -24,6 +24,7 @@
 from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
 from vllm.utils.jsontree import JSONTree, json_map_leaves
 from vllm.utils.mistral import is_mistral_tokenizer
+from vllm.v1.utils import record_function_or_nullcontext
 
 if TYPE_CHECKING:
     from transformers.configuration_utils import PretrainedConfig
@@ -60,17 +61,18 @@ def total_secs(self) -> float:
     @contextmanager
     def record(self, stage: str):
         """Record the execution time for a processing stage."""
-        if not self.enabled:
-            yield
-            return
-
-        start_time = time.perf_counter()
-        try:
-            yield
-        finally:
-            elapsed = time.perf_counter() - start_time
-            self.stage_secs.setdefault(stage, 0.0)
-            self.stage_secs[stage] += elapsed
+        with record_function_or_nullcontext(f"mm_processor: {stage}"):
+            if not self.enabled:
+                yield
+                return
+
+            start_time = time.perf_counter()
+            try:
+                yield
+            finally:
+                elapsed = time.perf_counter() - start_time
+                self.stage_secs.setdefault(stage, 0.0)
+                self.stage_secs[stage] += elapsed
 
     def get_stats_dict(self):
         stats_dict = {
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 1c87d9ec0944..6f7b79c87d29 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -871,16 +871,19 @@ async def check_health(self) -> None:
             raise self.dead_error
 
     async def start_profile(self, profile_prefix: str | None = None) -> None:
-        coros = [self.engine_core.profile_async(True, profile_prefix)]
+        # The torch profiler must be started on the event-loop thread: kineto
+        # registers its client on the calling thread, and `record_function`
+        # calls from any other thread are silently dropped. Starting via
+        # `asyncio.to_thread` would run it on an executor worker and break all
+        # user-defined profiling scopes on the frontend.
         if self.profiler is not None:
-            coros.append(asyncio.to_thread(self.profiler.start))
-        await asyncio.gather(*coros)
+            self.profiler.start()
+        await self.engine_core.profile_async(True, profile_prefix)
 
     async def stop_profile(self) -> None:
-        coros = [self.engine_core.profile_async(False)]
+        await self.engine_core.profile_async(False)
         if self.profiler is not None:
-            coros.append(asyncio.to_thread(self.profiler.stop))
-        await asyncio.gather(*coros)
+            self.profiler.stop()
 
     async def reset_mm_cache(self) -> None:
         await self.renderer.clear_mm_cache_async()
diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py
index 4f3e192772ff..ecae4eac88ea 100644
--- a/vllm/v1/worker/gpu_model_runner.py
+++ b/vllm/v1/worker/gpu_model_runner.py
@@ -2853,7 +2853,7 @@ def _execute_mm_encoder(
                     video_mm_kwargs_item = mm_kwargs[current_item_idx + video_idx]
                     with self.timed_encoder_operation(
                         should_time, mm_lora_refs, current_item_idx + video_idx, 1
-                    ):
+                    ), record_function_or_nullcontext("mm_encoder: forward"):
                         _, _, micro_batch_mm_inputs = next(
                             group_and_batch_mm_kwargs(
                                 [video_mm_kwargs_item],
@@ -2880,7 +2880,7 @@ def _execute_mm_encoder(
 
                 with self.timed_encoder_operation(
                     should_time, mm_lora_refs, current_item_idx, num_items
-                ):
+                ), record_function_or_nullcontext("mm_encoder: forward"):
                     cudagraph_output = None
                     if (
                         self.encoder_cudagraph_manager is not None

From a793f3517557efcc2d108c63c75ec0e98beb0e22 Mon Sep 17 00:00:00 2001
From: Sage Ahrac <sagiahrak@gmail.com>
Date: Mon, 13 Apr 2026 15:26:24 +0300
Subject: [PATCH 2/9] profile mm_processor on event loop thread

Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
---
 vllm/renderers/base.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py
index 9947f8c9187e..71bae81e7744 100644
--- a/vllm/renderers/base.py
+++ b/vllm/renderers/base.py
@@ -46,6 +46,7 @@
 from vllm.utils.counter import AtomicCounter
 from vllm.utils.torch_utils import set_default_torch_num_threads
 from vllm.v1.metrics.stats import MultiModalCacheStats
+from vllm.v1.utils import record_function_or_nullcontext
 
 from .embed_utils import safe_load_prompt_embeds
 from .inputs import (
@@ -741,14 +742,15 @@ async def _process_tokens_async(
 
         engine_input: TokensInput | MultiModalInput
         if multi_modal_data := prompt.get("multi_modal_data"):
-            engine_input = await self._process_multimodal_async(
-                prompt_token_ids,
-                multi_modal_data,
-                mm_processor_kwargs=prompt.get("mm_processor_kwargs"),
-                tokenization_kwargs=None,
-                mm_uuids=prompt.get("multi_modal_uuids"),
-                skip_mm_cache=skip_mm_cache,
-            )
+            with record_function_or_nullcontext("mm_processor: process_multimodal"):
+                engine_input = await self._process_multimodal_async(
+                    prompt_token_ids,
+                    multi_modal_data,
+                    mm_processor_kwargs=prompt.get("mm_processor_kwargs"),
+                    tokenization_kwargs=None,
+                    mm_uuids=prompt.get("multi_modal_uuids"),
+                    skip_mm_cache=skip_mm_cache,
+                )
         else:
             engine_input = tokens_input(prompt_token_ids)
 

From 914314f317da0e26db0cdf807cf1895539fa2cc2 Mon Sep 17 00:00:00 2001
From: Sage Ahrac <sagiahrak@gmail.com>
Date: Mon, 13 Apr 2026 15:30:11 +0300
Subject: [PATCH 3/9] revert context.py: mm_processor sub-stage spans never
 emit on pool workers

Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
---
 vllm/multimodal/processing/context.py | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/vllm/multimodal/processing/context.py b/vllm/multimodal/processing/context.py
index 068c1ea19e69..ef9710374d81 100644
--- a/vllm/multimodal/processing/context.py
+++ b/vllm/multimodal/processing/context.py
@@ -24,7 +24,6 @@
 from vllm.utils.func_utils import get_allowed_kwarg_only_overrides
 from vllm.utils.jsontree import JSONTree, json_map_leaves
 from vllm.utils.mistral import is_mistral_tokenizer
-from vllm.v1.utils import record_function_or_nullcontext
 
 if TYPE_CHECKING:
     from transformers.configuration_utils import PretrainedConfig
@@ -61,18 +60,17 @@ def total_secs(self) -> float:
     @contextmanager
     def record(self, stage: str):
         """Record the execution time for a processing stage."""
-        with record_function_or_nullcontext(f"mm_processor: {stage}"):
-            if not self.enabled:
-                yield
-                return
-
-            start_time = time.perf_counter()
-            try:
-                yield
-            finally:
-                elapsed = time.perf_counter() - start_time
-                self.stage_secs.setdefault(stage, 0.0)
-                self.stage_secs[stage] += elapsed
+        if not self.enabled:
+            yield
+            return
+
+        start_time = time.perf_counter()
+        try:
+            yield
+        finally:
+            elapsed = time.perf_counter() - start_time
+            self.stage_secs.setdefault(stage, 0.0)
+            self.stage_secs[stage] += elapsed
 
     def get_stats_dict(self):
         stats_dict = {

From 131129cb7e9465944d0fe24cb434fac8eb5a4af5 Mon Sep 17 00:00:00 2001
From: Sage Ahrac <sagiahrak@gmail.com>
Date: Mon, 13 Apr 2026 15:38:55 +0300
Subject: [PATCH 4/9] profile pil_decode span in url download path

Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
---
 vllm/multimodal/media/connector.py | 16 +++++++++-------
 1 file changed, 9 insertions(+), 7 deletions(-)

diff --git a/vllm/multimodal/media/connector.py b/vllm/multimodal/media/connector.py
index 7673497ba31b..4ce9532e4e64 100644
--- a/vllm/multimodal/media/connector.py
+++ b/vllm/multimodal/media/connector.py
@@ -337,10 +337,10 @@ async def load_from_url_async(
                 global_thread_pool, self._get_cached_bytes, url
             )
             if cached is not None:
-                future = loop.run_in_executor(
-                    global_thread_pool, media_io.load_bytes, cached
-                )
-                return await future
+                with record_function_or_nullcontext("media: pil_decode"):
+                    return await loop.run_in_executor(
+                        global_thread_pool, media_io.load_bytes, cached
+                    )
 
             connection = self.connection
             with record_function_or_nullcontext("media: url_download"):
@@ -353,11 +353,13 @@ async def load_from_url_async(
             await loop.run_in_executor(
                 global_thread_pool, self._put_cached_bytes, url, data
             )
-            future = loop.run_in_executor(global_thread_pool, media_io.load_bytes, data)
-            return await future
+            with record_function_or_nullcontext("media: pil_decode"):
+                return await loop.run_in_executor(
+                    global_thread_pool, media_io.load_bytes, data
+                )
 
         if url_spec.scheme == "data":
-            future = loop.run_in_executor(
+            future: asyncio.Future[_M] = loop.run_in_executor(
                 global_thread_pool, self._load_data_url, url_spec, media_io
             )
             return await future

From d250a790e215fb880792488c05c078af159924ce Mon Sep 17 00:00:00 2001
From: Sage Ahrac <sagiahrak@gmail.com>
Date: Mon, 13 Apr 2026 15:44:31 +0300
Subject: [PATCH 5/9] taskfile

Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
---
 Taskfile.yml | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 156 insertions(+)
 create mode 100644 Taskfile.yml

diff --git a/Taskfile.yml b/Taskfile.yml
new file mode 100644
index 000000000000..92087dc24945
--- /dev/null
+++ b/Taskfile.yml
@@ -0,0 +1,156 @@
+# https://taskfile.dev
+version: '3'
+
+vars:
+  MODEL: '{{.MODEL | default "llava-hf/llava-1.5-7b-hf"}}'
+  PORT: '{{.PORT | default "8000"}}'
+  TRACE_DIR: '{{.TRACE_DIR | default "/tmp/vllm-traces"}}'
+  PYTHON: .venv/bin/python
+  LOG_FILE: /tmp/vllm-server.log
+  # Stable public image used for ad-hoc profiling requests
+  IMAGE_URL: '{{.IMAGE_URL | default "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"}}'
+
+tasks:
+  default:
+    desc: List all tasks
+    cmds:
+      - task --list
+    silent: true
+
+  # ── Server ────────────────────────────────────────────────────────────
+
+  serve:
+    desc: "Start vLLM with torch profiler + custom record_function scopes. Logs tee → {{.LOG_FILE}}"
+    cmds:
+      - mkdir -p {{.TRACE_DIR}}
+      - |
+        VLLM_CUSTOM_SCOPES_FOR_PROFILING=1 \
+        .venv/bin/vllm serve {{.MODEL}} \
+          --port {{.PORT}} \
+          --profiler-config '{"profiler":"torch","torch_profiler_dir":"{{.TRACE_DIR}}"}' \
+          2>&1 | tee {{.LOG_FILE}}
+
+  serve:plain:
+    desc: Start vLLM without profiler (baseline / smoke test)
+    cmds:
+      - .venv/bin/vllm serve {{.MODEL}} --port {{.PORT}} 2>&1 | tee {{.LOG_FILE}}
+
+  # ── Profiler control ──────────────────────────────────────────────────
+
+  profile:start:
+    desc: POST /start_profile — open a trace window
+    cmds:
+      - curl -fsS -X POST http://localhost:{{.PORT}}/start_profile && echo " profiler started"
+
+  profile:stop:
+    desc: POST /stop_profile — flush trace files to {{.TRACE_DIR}}
+    cmds:
+      - curl -fsS -X POST http://localhost:{{.PORT}}/stop_profile && echo " profiler stopped"
+
+  profile:request:
+    desc: "Send one VLM chat request with an image URL (IMAGE_URL= to override)"
+    cmds:
+      - |
+        curl -s http://localhost:{{.PORT}}/v1/chat/completions \
+          -H "Content-Type: application/json" \
+          -d '{
+            "model": "{{.MODEL}}",
+            "messages": [{
+              "role": "user",
+              "content": [
+                {"type": "image_url", "image_url": {"url": "{{.IMAGE_URL}}"}},
+                {"type": "text", "text": "Describe this image in one sentence."}
+              ]
+            }],
+            "max_tokens": 64
+          }' | {{.PYTHON}} -m json.tool
+
+  profile:run:
+    desc: "One-shot profiling: start → one VLM request → stop (captures one request end-to-end)"
+    cmds:
+      - task: profile:start
+      - task: profile:request
+      - task: profile:stop
+      - task: traces:ls
+
+  # ── Stress test ───────────────────────────────────────────────────────
+
+  stress:
+    desc: "Stress test the render endpoint. Override: N=50 C=8 task stress"
+    vars:
+      N: '{{.N | default "20"}}'
+      C: '{{.C | default "4"}}'
+    cmds:
+      - |
+        {{.PYTHON}} stress_test_render.py \
+          --url http://localhost:{{.PORT}} \
+          --n {{.N}} \
+          --concurrency {{.C}} \
+          --mode chat
+
+  # ── Traces ────────────────────────────────────────────────────────────
+
+  traces:ls:
+    desc: List trace files in TRACE_DIR, split by frontend vs GPU worker
+    cmds:
+      - |
+        echo "=== Traces in {{.TRACE_DIR}} ==="
+        echo ""
+        echo "Frontend (event-loop: url_download, mm_processor, prefill/decode labels):"
+        ls -lht {{.TRACE_DIR}}/async_llm.*.json.gz 2>/dev/null || echo "  (none)"
+        echo ""
+        echo "GPU worker (mm_encoder:forward, execute_model, CUDA kernels):"
+        ls -lht {{.TRACE_DIR}}/rank*.*.json.gz 2>/dev/null || echo "  (none)"
+    silent: true
+
+  traces:open:
+    desc: Open Perfetto UI and print the latest frontend + GPU trace paths to upload
+    cmds:
+      - open https://ui.perfetto.dev
+      - |
+        echo ""
+        echo "Drag & drop both files into Perfetto (or File → Open Trace File):"
+        echo ""
+        printf "  Frontend : "
+        ls -t {{.TRACE_DIR}}/async_llm.*.json.gz 2>/dev/null | head -1 || echo "(none)"
+        printf "  GPU      : "
+        ls -t {{.TRACE_DIR}}/rank0.*.json.gz 2>/dev/null | head -1 || echo "(none)"
+        echo ""
+        echo "Both traces share wall-clock time and will align on the same timeline."
+    silent: true
+
+  traces:unzip:
+    desc: Decompress all .gz traces (needed by tools that require raw JSON)
+    cmds:
+      - |
+        found=0
+        for f in {{.TRACE_DIR}}/*.json.gz; do
+          [ -f "$f" ] || continue
+          found=1
+          out="${f%.gz}"
+          if [ ! -f "$out" ]; then
+            gunzip -k "$f" && echo "Decompressed: $out"
+          else
+            echo "Already exists: $out"
+          fi
+        done
+        [ "$found" -eq 0 ] && echo "No .gz traces found in {{.TRACE_DIR}}"
+
+  traces:clean:
+    desc: Remove all traces from TRACE_DIR
+    prompt: "Delete all traces in {{.TRACE_DIR}}?"
+    cmds:
+      - rm -f {{.TRACE_DIR}}/*.json.gz {{.TRACE_DIR}}/*.json
+      - echo "Cleaned {{.TRACE_DIR}}"
+
+  # ── Logs ──────────────────────────────────────────────────────────────
+
+  logs:
+    desc: "Tail the server log (task serve tees output to {{.LOG_FILE}})"
+    cmds:
+      - tail -f {{.LOG_FILE}}
+
+  logs:grep:
+    desc: "Search server log for keyword: PATTERN=mm_processor task logs:grep"
+    cmds:
+      - grep -i "{{.PATTERN}}" {{.LOG_FILE}} | tail -100

From 973cca943e3f8110b24c75198fdb1207c33ebc8b Mon Sep 17 00:00:00 2001
From: Sage Ahrac <sagiahrak@gmail.com>
Date: Mon, 13 Apr 2026 17:49:04 +0300
Subject: [PATCH 6/9] profiler: add summary_trace.py and experiment tasks

Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
---
 Taskfile.yml           | 165 ++++++++++++++++++++++++++++++++++++++++-
 tools/summary_trace.py | 119 +++++++++++++++++++++++++++++
 2 files changed, 282 insertions(+), 2 deletions(-)
 create mode 100644 tools/summary_trace.py

diff --git a/Taskfile.yml b/Taskfile.yml
index 92087dc24945..e1dfefd2019f 100644
--- a/Taskfile.yml
+++ b/Taskfile.yml
@@ -2,13 +2,18 @@
 version: '3'
 
 vars:
-  MODEL: '{{.MODEL | default "llava-hf/llava-1.5-7b-hf"}}'
+  MODEL: '{{.MODEL | default "google/gemma-3-27b-it"}}'
   PORT: '{{.PORT | default "8000"}}'
   TRACE_DIR: '{{.TRACE_DIR | default "/tmp/vllm-traces"}}'
+  LOCAL_TRACE_DIR: '{{.LOCAL_TRACE_DIR | default "./traces"}}'
   PYTHON: .venv/bin/python
   LOG_FILE: /tmp/vllm-server.log
+  # false = only custom spans visible; true = full Python stacks (needed for flamegraph)
+  WITH_STACK: '{{.WITH_STACK | default "true"}}'
   # Stable public image used for ad-hoc profiling requests
   IMAGE_URL: '{{.IMAGE_URL | default "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"}}'
+  # High-res public domain image (~8 MP)
+  IMAGE_URL_HIRES: '{{.IMAGE_URL_HIRES | default "https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg"}}'
 
 tasks:
   default:
@@ -27,7 +32,7 @@ tasks:
         VLLM_CUSTOM_SCOPES_FOR_PROFILING=1 \
         .venv/bin/vllm serve {{.MODEL}} \
           --port {{.PORT}} \
-          --profiler-config '{"profiler":"torch","torch_profiler_dir":"{{.TRACE_DIR}}"}' \
+          --profiler-config '{"profiler":"torch","torch_profiler_dir":"{{.TRACE_DIR}}","torch_profiler_with_stack":{{.WITH_STACK}},"max_iterations":1}' \
           2>&1 | tee {{.LOG_FILE}}
 
   serve:plain:
@@ -90,6 +95,31 @@ tasks:
 
   # ── Traces ────────────────────────────────────────────────────────────
 
+  traces:pull:
+    desc: "Copy latest traces from pod → LOCAL_TRACE_DIR (default ./traces)"
+    vars:
+      POD: '{{.POD | default "vllm-dev"}}'
+    cmds:
+      - mkdir -p {{.LOCAL_TRACE_DIR}}
+      - |
+        frontend=$(oc exec {{.POD}} -- bash -c "ls -t {{.TRACE_DIR}}/*.async_llm.*.json.gz 2>/dev/null | head -1")
+        gpu=$(oc exec {{.POD}} -- bash -c "ls -t {{.TRACE_DIR}}/rank0.*.json.gz 2>/dev/null | head -1")
+        [ -z "$frontend" ] && echo "No frontend trace found" && exit 1
+        [ -z "$gpu" ]      && echo "No GPU trace found" && exit 1
+        oc cp {{.POD}}:$frontend {{.LOCAL_TRACE_DIR}}/frontend.json.gz
+        oc cp {{.POD}}:$gpu      {{.LOCAL_TRACE_DIR}}/rank0.json.gz
+        echo "Pulled to {{.LOCAL_TRACE_DIR}}/"
+        ls -lh {{.LOCAL_TRACE_DIR}}/*.json.gz
+
+  traces:summary:
+    desc: "Build summary.json.gz from frontend + GPU traces (LOCAL_TRACE_DIR= to override)"
+    cmds:
+      - |
+        {{.PYTHON}} tools/summary_trace.py \
+          {{.LOCAL_TRACE_DIR}}/frontend.json.gz \
+          {{.LOCAL_TRACE_DIR}}/rank0.json.gz \
+          {{.LOCAL_TRACE_DIR}}/summary.json.gz
+
   traces:ls:
     desc: List trace files in TRACE_DIR, split by frontend vs GPU worker
     cmds:
@@ -143,6 +173,137 @@ tasks:
       - rm -f {{.TRACE_DIR}}/*.json.gz {{.TRACE_DIR}}/*.json
       - echo "Cleaned {{.TRACE_DIR}}"
 
+  # ── Experiments ───────────────────────────────────────────────────────
+  # Each experiment restarts the server to ensure a cold cache, then captures
+  # one profiled request and saves traces to traces/expN/.
+  #
+  # Experiments:
+  #   1. small image  — URL
+  #   2. hi-res image — URL
+  #   3. small image  — base64
+  #   4. hi-res image — base64
+
+  _exp:restart-server:
+    internal: true
+    desc: Kill + restart vllm in pod with profiling, wait until ready
+    vars:
+      POD: '{{.POD | default "vllm-dev"}}'
+    cmds:
+      - |
+        oc exec {{.POD}} -- bash -c "
+          kill \$(pgrep -f 'vllm serve') 2>/dev/null || true; sleep 3
+          rm -f {{.TRACE_DIR}}/*.json.gz {{.TRACE_DIR}}/*.txt
+          nohup env VLLM_CUSTOM_SCOPES_FOR_PROFILING=1 \
+            HF_TOKEN='$HF_TOKEN' HUGGING_FACE_HUB_TOKEN='$HF_TOKEN' \
+            /workspace/vllm/.venv/bin/vllm serve {{.MODEL}} \
+            --port {{.PORT}} \
+            --profiler-config '{\"profiler\":\"torch\",\"torch_profiler_dir\":\"{{.TRACE_DIR}}\",\"torch_profiler_with_stack\":{{.WITH_STACK}},\"max_iterations\":1}' \
+            > /tmp/vllm-serve.log 2>&1 &
+          echo \"Server PID \$!\"
+          until curl -sf http://localhost:{{.PORT}}/v1/models > /dev/null 2>&1; do
+            sleep 5
+          done
+          echo ready"
+
+  _exp:capture:
+    internal: true
+    desc: "Profile one URL-based request. Vars: IMAGE_URL, OUTDIR"
+    vars:
+      POD: '{{.POD | default "vllm-dev"}}'
+    cmds:
+      - |
+        oc exec {{.POD}} -- bash -c "
+          curl -fsS -X POST http://localhost:{{.PORT}}/start_profile
+          curl -sf http://localhost:{{.PORT}}/v1/chat/completions \
+            -H 'Content-Type: application/json' \
+            -d '{\"model\":\"{{.MODEL}}\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"image_url\",\"image_url\":{\"url\":\"{{.IMAGE_URL}}\"}},{\"type\":\"text\",\"text\":\"Describe this image in one sentence.\"}]}],\"max_tokens\":64}' \
+            > /dev/null
+          curl -fsS -X POST http://localhost:{{.PORT}}/stop_profile"
+      - task: traces:pull
+        vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" }
+      - task: traces:summary
+        vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" }
+
+  _exp:capture-base64:
+    internal: true
+    # Download and encode the image on the pod so the payload never crosses the
+    # local→pod boundary. Large hi-res images (several MB base64) would exceed
+    # shell ARG_MAX if constructed locally and passed via oc exec bash -c "...".
+    desc: "Profile one base64 request. Vars: IMAGE_URL, OUTDIR"
+    vars:
+      POD: '{{.POD | default "vllm-dev"}}'
+    cmds:
+      - |
+        oc exec -i {{.POD}} -- python3 - <<'PYEOF'
+import base64, json, urllib.request as R
+
+base  = "http://localhost:{{.PORT}}"
+url   = "{{.IMAGE_URL}}"
+model = "{{.MODEL}}"
+
+b64  = base64.b64encode(R.urlopen(url).read()).decode()
+body = json.dumps({
+    "model": model,
+    "messages": [{"role": "user", "content": [
+        {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64," + b64}},
+        {"type": "text", "text": "Describe this image in one sentence."},
+    ]}],
+    "max_tokens": 64,
+}).encode()
+
+R.urlopen(R.Request(f"{base}/start_profile", method="POST"))
+R.urlopen(R.Request(f"{base}/v1/chat/completions", body, {"Content-Type": "application/json"}))
+R.urlopen(R.Request(f"{base}/stop_profile", method="POST"))
+PYEOF
+      - task: traces:pull
+        vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" }
+      - task: traces:summary
+        vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" }
+
+  exp:1:
+    desc: "Experiment 1 — small image, URL delivery"
+    cmds:
+      - task: _exp:restart-server
+      - task: _exp:capture
+        vars:
+          IMAGE_URL: "{{.IMAGE_URL}}"
+          OUTDIR: ./traces/exp1_small_url
+
+  exp:2:
+    desc: "Experiment 2 — hi-res image, URL delivery"
+    cmds:
+      - task: _exp:restart-server
+      - task: _exp:capture
+        vars:
+          IMAGE_URL: "{{.IMAGE_URL_HIRES}}"
+          OUTDIR: ./traces/exp2_hires_url
+
+  exp:3:
+    desc: "Experiment 3 — small image, base64 delivery"
+    cmds:
+      - task: _exp:restart-server
+      - task: _exp:capture-base64
+        vars:
+          IMAGE_URL: "{{.IMAGE_URL}}"
+          OUTDIR: ./traces/exp3_small_base64
+
+  exp:4:
+    desc: "Experiment 4 — hi-res image, base64 delivery"
+    cmds:
+      - task: _exp:restart-server
+      - task: _exp:capture-base64
+        vars:
+          IMAGE_URL: "{{.IMAGE_URL_HIRES}}"
+          OUTDIR: ./traces/exp4_hires_base64
+
+  exp:all:
+    desc: "Run all 4 experiments sequentially (server restart between each)"
+    cmds:
+      - task: exp:1
+      - task: exp:2
+      - task: exp:3
+      - task: exp:4
+
   # ── Logs ──────────────────────────────────────────────────────────────
 
   logs:
diff --git a/tools/summary_trace.py b/tools/summary_trace.py
new file mode 100644
index 000000000000..698584107d40
--- /dev/null
+++ b/tools/summary_trace.py
@@ -0,0 +1,119 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Build a Perfetto summary trace from vLLM frontend + GPU worker traces.
+
+Extracts the key VLM pipeline spans and aligns them by wall-clock time.
+Both traces use CLOCK_MONOTONIC microseconds (same machine), so timestamps
+are directly comparable — no clock conversion needed.
+
+Output tracks:
+  Frontend   — media: url_download, media: pil_decode, mm_processor: process_multimodal
+  GPU Worker — mm_encoder: forward, gpu_model_runner: forward
+
+Usage:
+    python tools/summary_trace.py \\
+        traces/frontend.json.gz traces/rank0.json.gz traces/summary.json.gz
+"""
+
+import argparse
+import gzip
+import json
+from pathlib import Path
+
+FRONTEND_SPANS = {
+    "media: url_download",
+    "media: pil_decode",
+    "mm_processor: process_multimodal",
+}
+
+GPU_SPANS = {
+    "mm_encoder: forward",
+    "gpu_model_runner: forward",
+}
+
+TID_FRONTEND = 1
+TID_GPU = 2
+
+
+def load_spans(path: Path, names: set) -> list:
+    with gzip.open(path) as f:
+        data = json.load(f)
+    return [
+        e for e in data["traceEvents"] if e.get("ph") == "X" and e.get("name") in names
+    ]
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter
+    )
+    parser.add_argument(
+        "frontend", type=Path, help="Frontend trace (async_llm.*.json.gz)"
+    )
+    parser.add_argument("gpu", type=Path, help="GPU worker trace (rank0.*.json.gz)")
+    parser.add_argument("output", type=Path, help="Output summary trace (.json.gz)")
+    args = parser.parse_args()
+
+    frontend_spans = load_spans(args.frontend, FRONTEND_SPANS)
+    gpu_spans = load_spans(args.gpu, GPU_SPANS)
+
+    all_spans = frontend_spans + gpu_spans
+    if not all_spans:
+        raise SystemExit(
+            "No matching spans found — check that VLLM_CUSTOM_SCOPES_FOR_PROFILING=1"
+        )
+
+    # Normalize timestamps: t=0 is the start of the earliest span.
+    t0 = min(e["ts"] for e in all_spans)
+
+    events = []
+    for e in all_spans:
+        tid = TID_GPU if e["name"] in GPU_SPANS else TID_FRONTEND
+        events.append(
+            {
+                "ph": "X",
+                "cat": "user_annotation",
+                "name": e["name"],
+                "pid": 1,
+                "tid": tid,
+                "ts": e["ts"] - t0,
+                "dur": e["dur"],
+            }
+        )
+
+    events += [
+        {
+            "ph": "M",
+            "pid": 1,
+            "tid": TID_FRONTEND,
+            "name": "thread_name",
+            "args": {"name": "Frontend"},
+        },
+        {
+            "ph": "M",
+            "pid": 1,
+            "tid": TID_GPU,
+            "name": "thread_name",
+            "args": {"name": "GPU Worker"},
+        },
+    ]
+
+    args.output.parent.mkdir(parents=True, exist_ok=True)
+    with gzip.open(args.output, "wt") as f:
+        json.dump({"traceEvents": events, "displayTimeUnit": "ms"}, f)
+
+    # Print summary table
+    span_events = sorted(
+        (e for e in events if e.get("ph") == "X"), key=lambda e: e["ts"]
+    )
+    print(f"\n{'Span':<45} {'start':>9}  {'dur':>9}")
+    print("-" * 67)
+    for e in span_events:
+        print(f"{e['name']:<45} {e['ts'] / 1000:>8.1f}ms  {e['dur'] / 1000:>8.1f}ms")
+    print(f"\nWritten: {args.output}")
+
+
+if __name__ == "__main__":
+    main()

From 272304d61d6e1e03ab3f968c74de2aaf717901ac Mon Sep 17 00:00:00 2001
From: Sage Ahrac <sagiahrak@gmail.com>
Date: Mon, 13 Apr 2026 17:56:23 +0300
Subject: [PATCH 7/9] profiler: fix stop_profile ordering and add data: URL
 span

Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
---
 vllm/multimodal/media/connector.py | 21 ++++++++++-----------
 vllm/v1/engine/async_llm.py        |  9 ++++++++-
 2 files changed, 18 insertions(+), 12 deletions(-)

diff --git a/vllm/multimodal/media/connector.py b/vllm/multimodal/media/connector.py
index 4ce9532e4e64..f09019923f17 100644
--- a/vllm/multimodal/media/connector.py
+++ b/vllm/multimodal/media/connector.py
@@ -301,12 +301,11 @@ def load_from_url(
                 return media_io.load_bytes(cached)
 
             connection = self.connection
-            with record_function_or_nullcontext("media: url_download"):
-                data = connection.get_bytes(
-                    url_spec.url,
-                    timeout=fetch_timeout,
-                    allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
-                )
+            data = connection.get_bytes(
+                url_spec.url,
+                timeout=fetch_timeout,
+                allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
+            )
 
             self._put_cached_bytes(url, data)
             return media_io.load_bytes(data)
@@ -359,13 +358,13 @@ async def load_from_url_async(
                 )
 
         if url_spec.scheme == "data":
-            future: asyncio.Future[_M] = loop.run_in_executor(
-                global_thread_pool, self._load_data_url, url_spec, media_io
-            )
-            return await future
+            with record_function_or_nullcontext("media: data_decode"):
+                return await loop.run_in_executor(
+                    global_thread_pool, self._load_data_url, url_spec, media_io
+                )
 
         if url_spec.scheme == "file":
-            future = loop.run_in_executor(
+            future: asyncio.Future[_M] = loop.run_in_executor(
                 global_thread_pool, self._load_file_url, url_spec, media_io
             )
             return await future
diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py
index 6f7b79c87d29..0ab4d0147365 100644
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -881,9 +881,16 @@ async def start_profile(self, profile_prefix: str | None = None) -> None:
         await self.engine_core.profile_async(True, profile_prefix)
 
     async def stop_profile(self) -> None:
-        await self.engine_core.profile_async(False)
+        # Stop the frontend profiler first (same thread as start(), required by
+        # kineto), then tell the engine. Doing it the other way around would
+        # leave the frontend profiler open while awaiting the engine roundtrip,
+        # capturing idle asyncio noise after the request is already done.
+        # profiler.stop() calls on_trace_ready which writes the trace file —
+        # this is blocking I/O on the event loop, but stop_profile is a debug
+        # path so the brief pause is acceptable.
         if self.profiler is not None:
             self.profiler.stop()
+        await self.engine_core.profile_async(False)
 
     async def reset_mm_cache(self) -> None:
         await self.renderer.clear_mm_cache_async()

From f44933f6e20e5baf202f14f23771f5c27fbb63f9 Mon Sep 17 00:00:00 2001
From: Sage Ahrac <sagiahrak@gmail.com>
Date: Mon, 13 Apr 2026 18:06:28 +0300
Subject: [PATCH 8/9] profiler: fix base64 capture task and add
 capture_request.py

Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
---
 Taskfile.yml             |  38 ++++------
 tools/capture_request.py | 148 +++++++++++++++++++++++++++++++++++++++
 2 files changed, 162 insertions(+), 24 deletions(-)
 create mode 100644 tools/capture_request.py

diff --git a/Taskfile.yml b/Taskfile.yml
index e1dfefd2019f..bd5a9c75d079 100644
--- a/Taskfile.yml
+++ b/Taskfile.yml
@@ -226,35 +226,25 @@ tasks:
 
   _exp:capture-base64:
     internal: true
-    # Download and encode the image on the pod so the payload never crosses the
-    # local→pod boundary. Large hi-res images (several MB base64) would exceed
-    # shell ARG_MAX if constructed locally and passed via oc exec bash -c "...".
+    # Use oc port-forward so capture_request.py runs locally with --base64,
+    # downloading the image locally and sending the encoded payload to the pod.
+    # This avoids heredoc-in-YAML issues and keeps Python code out of the Taskfile.
     desc: "Profile one base64 request. Vars: IMAGE_URL, OUTDIR"
     vars:
       POD: '{{.POD | default "vllm-dev"}}'
+      LOCAL_PORT: "18000"
     cmds:
       - |
-        oc exec -i {{.POD}} -- python3 - <<'PYEOF'
-import base64, json, urllib.request as R
-
-base  = "http://localhost:{{.PORT}}"
-url   = "{{.IMAGE_URL}}"
-model = "{{.MODEL}}"
-
-b64  = base64.b64encode(R.urlopen(url).read()).decode()
-body = json.dumps({
-    "model": model,
-    "messages": [{"role": "user", "content": [
-        {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64," + b64}},
-        {"type": "text", "text": "Describe this image in one sentence."},
-    ]}],
-    "max_tokens": 64,
-}).encode()
-
-R.urlopen(R.Request(f"{base}/start_profile", method="POST"))
-R.urlopen(R.Request(f"{base}/v1/chat/completions", body, {"Content-Type": "application/json"}))
-R.urlopen(R.Request(f"{base}/stop_profile", method="POST"))
-PYEOF
+        oc port-forward {{.POD}} {{.LOCAL_PORT}}:{{.PORT}} &
+        PFW=$!
+        trap "kill $PFW 2>/dev/null" EXIT
+        sleep 2
+        python3 tools/capture_request.py \
+          --endpoint http://localhost:{{.LOCAL_PORT}} \
+          --model    {{.MODEL}} \
+          --image    {{.IMAGE_URL}} \
+          --base64
+        kill $PFW 2>/dev/null || true
       - task: traces:pull
         vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" }
       - task: traces:summary
diff --git a/tools/capture_request.py b/tools/capture_request.py
new file mode 100644
index 000000000000..ae6fc7e73524
--- /dev/null
+++ b/tools/capture_request.py
@@ -0,0 +1,148 @@
+#!/usr/bin/env python3
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+Send one profiled VLM request to a running vLLM server.
+
+Wraps the request with /start_profile and /stop_profile so the trace
+contains exactly one request. Supports URL and base64 image delivery.
+
+Usage:
+    # URL mode (server fetches the image):
+    python tools/capture_request.py \\
+        --endpoint http://localhost:8000 \\
+        --model google/gemma-3-27b-it \\
+        --image https://example.com/cat.jpg
+
+    # base64 mode (image encoded and embedded in the request body):
+    python tools/capture_request.py \\
+        --endpoint http://localhost:8000 \\
+        --model google/gemma-3-27b-it \\
+        --image https://example.com/cat.jpg \\
+        --base64
+"""
+
+import argparse
+import json
+import sys
+import urllib.request as R
+from urllib.error import URLError
+
+import pybase64 as base64
+
+
+def _post(
+    url: str, body: bytes | None = None, content_type: str | None = None
+) -> bytes:
+    headers = {}
+    if content_type:
+        headers["Content-Type"] = content_type
+    req = R.Request(
+        url, data=body, headers=headers, method="POST" if body is None else None
+    )
+    req.method = "POST"
+    try:
+        return R.urlopen(req).read()
+    except URLError as e:
+        print(f"ERROR: {url} → {e}", file=sys.stderr)
+        raise
+
+
+def build_image_content(image_url: str, base64_mode: bool) -> dict:
+    if not base64_mode:
+        return {"type": "image_url", "image_url": {"url": image_url}}
+
+    print(f"Downloading {image_url} for base64 encoding...", file=sys.stderr)
+    img_bytes = R.urlopen(image_url).read()
+
+    # Infer MIME type from URL extension; default to jpeg.
+    lower = image_url.lower()
+    if lower.endswith(".png"):
+        mime = "image/png"
+    elif lower.endswith(".webp"):
+        mime = "image/webp"
+    elif lower.endswith(".gif"):
+        mime = "image/gif"
+    else:
+        mime = "image/jpeg"
+
+    b64 = base64.b64encode(img_bytes).decode()
+    print(
+        f"Encoded {len(img_bytes):,} bytes → {len(b64):,} base64 chars  mime={mime}",
+        file=sys.stderr,
+    )
+    return {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}}
+
+
+def main() -> None:
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.RawDescriptionHelpFormatter,
+    )
+    parser.add_argument(
+        "--endpoint",
+        default="http://localhost:8000",
+        help="vLLM server base URL (default: http://localhost:8000)",
+    )
+    parser.add_argument("--model", required=True, help="Model name as served by vLLM")
+    parser.add_argument(
+        "--image",
+        required=True,
+        metavar="URL",
+        help="Image URL to send (fetched from server for URL mode, "
+        "or downloaded locally and base64-encoded for --base64)",
+    )
+    parser.add_argument(
+        "--base64",
+        action="store_true",
+        help="Download image locally and embed as base64 data URL",
+    )
+    parser.add_argument(
+        "--prompt",
+        default="Describe this image in one sentence.",
+        help="Text prompt to send alongside the image",
+    )
+    parser.add_argument("--max-tokens", type=int, default=64)
+    args = parser.parse_args()
+
+    base = args.endpoint.rstrip("/")
+    image_content = build_image_content(args.image, args.base64)
+
+    payload = json.dumps(
+        {
+            "model": args.model,
+            "messages": [
+                {
+                    "role": "user",
+                    "content": [
+                        image_content,
+                        {"type": "text", "text": args.prompt},
+                    ],
+                }
+            ],
+            "max_tokens": args.max_tokens,
+        }
+    ).encode()
+
+    print("Starting profiler...", file=sys.stderr)
+    _post(f"{base}/start_profile")
+
+    print("Sending request...", file=sys.stderr)
+    resp_bytes = _post(f"{base}/v1/chat/completions", payload, "application/json")
+    result = json.loads(resp_bytes)
+    content = result["choices"][0]["message"]["content"]
+    usage = result.get("usage", {})
+    prompt_t = usage.get("prompt_tokens", 0)
+    compl_t = usage.get("completion_tokens", 0)
+    print(
+        f"Response ({prompt_t}→{compl_t} tokens): {content[:120]}",
+        file=sys.stderr,
+    )
+
+    print("Stopping profiler...", file=sys.stderr)
+    _post(f"{base}/stop_profile")
+    print("Done.", file=sys.stderr)
+
+
+if __name__ == "__main__":
+    main()

From 2af7c78a7dcd424d84d6c3933ae443fd0e982bae Mon Sep 17 00:00:00 2001
From: Sage Ahrac <sagiahrak@gmail.com>
Date: Mon, 13 Apr 2026 18:48:02 +0300
Subject: [PATCH 9/9] profiler: fix kill pattern, add data_decode span, fix
 image URLs

Signed-off-by: Sage Ahrac <sagiahrak@gmail.com>
---
 Taskfile.yml             | 23 +++++++++++++++--------
 tools/capture_request.py |  5 +----
 tools/summary_trace.py   |  6 ++++--
 3 files changed, 20 insertions(+), 14 deletions(-)

diff --git a/Taskfile.yml b/Taskfile.yml
index bd5a9c75d079..bb30410b9179 100644
--- a/Taskfile.yml
+++ b/Taskfile.yml
@@ -13,7 +13,7 @@ vars:
   # Stable public image used for ad-hoc profiling requests
   IMAGE_URL: '{{.IMAGE_URL | default "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"}}'
   # High-res public domain image (~8 MP)
-  IMAGE_URL_HIRES: '{{.IMAGE_URL_HIRES | default "https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg"}}'
+  IMAGE_URL_HIRES: '{{.IMAGE_URL_HIRES | default "https://picsum.photos/seed/vllm/3000/2000.jpg"}}'
 
 tasks:
   default:
@@ -189,9 +189,13 @@ tasks:
     vars:
       POD: '{{.POD | default "vllm-dev"}}'
     cmds:
+      # Step 1a: kill old server.
+      # Run pkill directly (no bash -c wrapper) so the search pattern does not
+      # appear in any wrapper process's cmdline. pkill always excludes itself.
+      - oc exec {{.POD}} -- pkill -f /workspace/vllm/.venv/bin/vllm || true
+      # Step 1b: wipe old traces and launch new server (no sleep, no wait)
       - |
         oc exec {{.POD}} -- bash -c "
-          kill \$(pgrep -f 'vllm serve') 2>/dev/null || true; sleep 3
           rm -f {{.TRACE_DIR}}/*.json.gz {{.TRACE_DIR}}/*.txt
           nohup env VLLM_CUSTOM_SCOPES_FOR_PROFILING=1 \
             HF_TOKEN='$HF_TOKEN' HUGGING_FACE_HUB_TOKEN='$HF_TOKEN' \
@@ -199,11 +203,14 @@ tasks:
             --port {{.PORT}} \
             --profiler-config '{\"profiler\":\"torch\",\"torch_profiler_dir\":\"{{.TRACE_DIR}}\",\"torch_profiler_with_stack\":{{.WITH_STACK}},\"max_iterations\":1}' \
             > /tmp/vllm-serve.log 2>&1 &
-          echo \"Server PID \$!\"
-          until curl -sf http://localhost:{{.PORT}}/v1/models > /dev/null 2>&1; do
-            sleep 5
-          done
-          echo ready"
+          echo \"Server PID \$!\""
+      # Step 2: poll locally so oc exec never sits open for 2+ minutes
+      - |
+        echo "Waiting for server on {{.POD}}:{{.PORT}}..."
+        until oc exec {{.POD}} -- curl -sf http://localhost:{{.PORT}}/v1/models > /dev/null 2>&1; do
+          sleep 10
+        done
+        echo ready
 
   _exp:capture:
     internal: true
@@ -239,7 +246,7 @@ tasks:
         PFW=$!
         trap "kill $PFW 2>/dev/null" EXIT
         sleep 2
-        python3 tools/capture_request.py \
+        {{.PYTHON}} tools/capture_request.py \
           --endpoint http://localhost:{{.LOCAL_PORT}} \
           --model    {{.MODEL}} \
           --image    {{.IMAGE_URL}} \
diff --git a/tools/capture_request.py b/tools/capture_request.py
index ae6fc7e73524..ac61e6f4be7a 100644
--- a/tools/capture_request.py
+++ b/tools/capture_request.py
@@ -37,10 +37,7 @@ def _post(
     headers = {}
     if content_type:
         headers["Content-Type"] = content_type
-    req = R.Request(
-        url, data=body, headers=headers, method="POST" if body is None else None
-    )
-    req.method = "POST"
+    req = R.Request(url, data=body, headers=headers, method="POST")
     try:
         return R.urlopen(req).read()
     except URLError as e:
diff --git a/tools/summary_trace.py b/tools/summary_trace.py
index 698584107d40..6f92835da961 100644
--- a/tools/summary_trace.py
+++ b/tools/summary_trace.py
@@ -9,7 +9,8 @@
 are directly comparable — no clock conversion needed.
 
 Output tracks:
-  Frontend   — media: url_download, media: pil_decode, mm_processor: process_multimodal
+  Frontend   — media: url_download / media: data_decode,
+               media: pil_decode, mm_processor: process_multimodal
   GPU Worker — mm_encoder: forward, gpu_model_runner: forward
 
 Usage:
@@ -23,7 +24,8 @@
 from pathlib import Path
 
 FRONTEND_SPANS = {
-    "media: url_download",
+    "media: url_download",  # HTTP URL path
+    "media: data_decode",  # base64 / data: URL path
     "media: pil_decode",
     "mm_processor: process_multimodal",
 }