From 8fc2a7859b6e55682fecde5aff5f76354d45051f Mon Sep 17 00:00:00 2001 From: sagearc Date: Mon, 13 Apr 2026 12:18:26 +0000 Subject: [PATCH 1/9] profile encoder --- vllm/multimodal/media/connector.py | 23 +++++++++++++---------- vllm/multimodal/processing/context.py | 24 +++++++++++++----------- vllm/v1/engine/async_llm.py | 15 +++++++++------ vllm/v1/worker/gpu_model_runner.py | 4 ++-- 4 files changed, 37 insertions(+), 29 deletions(-) diff --git a/vllm/multimodal/media/connector.py b/vllm/multimodal/media/connector.py index 6ec90a45f1d0..7673497ba31b 100644 --- a/vllm/multimodal/media/connector.py +++ b/vllm/multimodal/media/connector.py @@ -23,6 +23,7 @@ from vllm.connections import HTTPConnection, global_http_connection from vllm.logger import init_logger from vllm.utils.registry import ExtensionManager +from vllm.v1.utils import record_function_or_nullcontext from .audio import AudioEmbeddingMediaIO, AudioMediaIO from .base import MediaIO @@ -300,11 +301,12 @@ def load_from_url( return media_io.load_bytes(cached) connection = self.connection - data = connection.get_bytes( - url_spec.url, - timeout=fetch_timeout, - allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS, - ) + with record_function_or_nullcontext("media: url_download"): + data = connection.get_bytes( + url_spec.url, + timeout=fetch_timeout, + allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS, + ) self._put_cached_bytes(url, data) return media_io.load_bytes(data) @@ -341,11 +343,12 @@ async def load_from_url_async( return await future connection = self.connection - data = await connection.async_get_bytes( - url_spec.url, - timeout=fetch_timeout, - allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS, - ) + with record_function_or_nullcontext("media: url_download"): + data = await connection.async_get_bytes( + url_spec.url, + timeout=fetch_timeout, + allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS, + ) await loop.run_in_executor( global_thread_pool, self._put_cached_bytes, url, data diff --git a/vllm/multimodal/processing/context.py b/vllm/multimodal/processing/context.py index ef9710374d81..068c1ea19e69 100644 --- a/vllm/multimodal/processing/context.py +++ b/vllm/multimodal/processing/context.py @@ -24,6 +24,7 @@ from vllm.utils.func_utils import get_allowed_kwarg_only_overrides from vllm.utils.jsontree import JSONTree, json_map_leaves from vllm.utils.mistral import is_mistral_tokenizer +from vllm.v1.utils import record_function_or_nullcontext if TYPE_CHECKING: from transformers.configuration_utils import PretrainedConfig @@ -60,17 +61,18 @@ def total_secs(self) -> float: @contextmanager def record(self, stage: str): """Record the execution time for a processing stage.""" - if not self.enabled: - yield - return - - start_time = time.perf_counter() - try: - yield - finally: - elapsed = time.perf_counter() - start_time - self.stage_secs.setdefault(stage, 0.0) - self.stage_secs[stage] += elapsed + with record_function_or_nullcontext(f"mm_processor: {stage}"): + if not self.enabled: + yield + return + + start_time = time.perf_counter() + try: + yield + finally: + elapsed = time.perf_counter() - start_time + self.stage_secs.setdefault(stage, 0.0) + self.stage_secs[stage] += elapsed def get_stats_dict(self): stats_dict = { diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 1c87d9ec0944..6f7b79c87d29 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -871,16 +871,19 @@ async def check_health(self) -> None: raise self.dead_error async def start_profile(self, profile_prefix: str | None = None) -> None: - coros = [self.engine_core.profile_async(True, profile_prefix)] + # The torch profiler must be started on the event-loop thread: kineto + # registers its client on the calling thread, and `record_function` + # calls from any other thread are silently dropped. Starting via + # `asyncio.to_thread` would run it on an executor worker and break all + # user-defined profiling scopes on the frontend. if self.profiler is not None: - coros.append(asyncio.to_thread(self.profiler.start)) - await asyncio.gather(*coros) + self.profiler.start() + await self.engine_core.profile_async(True, profile_prefix) async def stop_profile(self) -> None: - coros = [self.engine_core.profile_async(False)] + await self.engine_core.profile_async(False) if self.profiler is not None: - coros.append(asyncio.to_thread(self.profiler.stop)) - await asyncio.gather(*coros) + self.profiler.stop() async def reset_mm_cache(self) -> None: await self.renderer.clear_mm_cache_async() diff --git a/vllm/v1/worker/gpu_model_runner.py b/vllm/v1/worker/gpu_model_runner.py index 4f3e192772ff..ecae4eac88ea 100644 --- a/vllm/v1/worker/gpu_model_runner.py +++ b/vllm/v1/worker/gpu_model_runner.py @@ -2853,7 +2853,7 @@ def _execute_mm_encoder( video_mm_kwargs_item = mm_kwargs[current_item_idx + video_idx] with self.timed_encoder_operation( should_time, mm_lora_refs, current_item_idx + video_idx, 1 - ): + ), record_function_or_nullcontext("mm_encoder: forward"): _, _, micro_batch_mm_inputs = next( group_and_batch_mm_kwargs( [video_mm_kwargs_item], @@ -2880,7 +2880,7 @@ def _execute_mm_encoder( with self.timed_encoder_operation( should_time, mm_lora_refs, current_item_idx, num_items - ): + ), record_function_or_nullcontext("mm_encoder: forward"): cudagraph_output = None if ( self.encoder_cudagraph_manager is not None From a793f3517557efcc2d108c63c75ec0e98beb0e22 Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Mon, 13 Apr 2026 15:26:24 +0300 Subject: [PATCH 2/9] profile mm_processor on event loop thread Signed-off-by: Sage Ahrac --- vllm/renderers/base.py | 18 ++++++++++-------- 1 file changed, 10 insertions(+), 8 deletions(-) diff --git a/vllm/renderers/base.py b/vllm/renderers/base.py index 9947f8c9187e..71bae81e7744 100644 --- a/vllm/renderers/base.py +++ b/vllm/renderers/base.py @@ -46,6 +46,7 @@ from vllm.utils.counter import AtomicCounter from vllm.utils.torch_utils import set_default_torch_num_threads from vllm.v1.metrics.stats import MultiModalCacheStats +from vllm.v1.utils import record_function_or_nullcontext from .embed_utils import safe_load_prompt_embeds from .inputs import ( @@ -741,14 +742,15 @@ async def _process_tokens_async( engine_input: TokensInput | MultiModalInput if multi_modal_data := prompt.get("multi_modal_data"): - engine_input = await self._process_multimodal_async( - prompt_token_ids, - multi_modal_data, - mm_processor_kwargs=prompt.get("mm_processor_kwargs"), - tokenization_kwargs=None, - mm_uuids=prompt.get("multi_modal_uuids"), - skip_mm_cache=skip_mm_cache, - ) + with record_function_or_nullcontext("mm_processor: process_multimodal"): + engine_input = await self._process_multimodal_async( + prompt_token_ids, + multi_modal_data, + mm_processor_kwargs=prompt.get("mm_processor_kwargs"), + tokenization_kwargs=None, + mm_uuids=prompt.get("multi_modal_uuids"), + skip_mm_cache=skip_mm_cache, + ) else: engine_input = tokens_input(prompt_token_ids) From 914314f317da0e26db0cdf807cf1895539fa2cc2 Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Mon, 13 Apr 2026 15:30:11 +0300 Subject: [PATCH 3/9] revert context.py: mm_processor sub-stage spans never emit on pool workers Signed-off-by: Sage Ahrac --- vllm/multimodal/processing/context.py | 24 +++++++++++------------- 1 file changed, 11 insertions(+), 13 deletions(-) diff --git a/vllm/multimodal/processing/context.py b/vllm/multimodal/processing/context.py index 068c1ea19e69..ef9710374d81 100644 --- a/vllm/multimodal/processing/context.py +++ b/vllm/multimodal/processing/context.py @@ -24,7 +24,6 @@ from vllm.utils.func_utils import get_allowed_kwarg_only_overrides from vllm.utils.jsontree import JSONTree, json_map_leaves from vllm.utils.mistral import is_mistral_tokenizer -from vllm.v1.utils import record_function_or_nullcontext if TYPE_CHECKING: from transformers.configuration_utils import PretrainedConfig @@ -61,18 +60,17 @@ def total_secs(self) -> float: @contextmanager def record(self, stage: str): """Record the execution time for a processing stage.""" - with record_function_or_nullcontext(f"mm_processor: {stage}"): - if not self.enabled: - yield - return - - start_time = time.perf_counter() - try: - yield - finally: - elapsed = time.perf_counter() - start_time - self.stage_secs.setdefault(stage, 0.0) - self.stage_secs[stage] += elapsed + if not self.enabled: + yield + return + + start_time = time.perf_counter() + try: + yield + finally: + elapsed = time.perf_counter() - start_time + self.stage_secs.setdefault(stage, 0.0) + self.stage_secs[stage] += elapsed def get_stats_dict(self): stats_dict = { From 131129cb7e9465944d0fe24cb434fac8eb5a4af5 Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Mon, 13 Apr 2026 15:38:55 +0300 Subject: [PATCH 4/9] profile pil_decode span in url download path Signed-off-by: Sage Ahrac --- vllm/multimodal/media/connector.py | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/vllm/multimodal/media/connector.py b/vllm/multimodal/media/connector.py index 7673497ba31b..4ce9532e4e64 100644 --- a/vllm/multimodal/media/connector.py +++ b/vllm/multimodal/media/connector.py @@ -337,10 +337,10 @@ async def load_from_url_async( global_thread_pool, self._get_cached_bytes, url ) if cached is not None: - future = loop.run_in_executor( - global_thread_pool, media_io.load_bytes, cached - ) - return await future + with record_function_or_nullcontext("media: pil_decode"): + return await loop.run_in_executor( + global_thread_pool, media_io.load_bytes, cached + ) connection = self.connection with record_function_or_nullcontext("media: url_download"): @@ -353,11 +353,13 @@ async def load_from_url_async( await loop.run_in_executor( global_thread_pool, self._put_cached_bytes, url, data ) - future = loop.run_in_executor(global_thread_pool, media_io.load_bytes, data) - return await future + with record_function_or_nullcontext("media: pil_decode"): + return await loop.run_in_executor( + global_thread_pool, media_io.load_bytes, data + ) if url_spec.scheme == "data": - future = loop.run_in_executor( + future: asyncio.Future[_M] = loop.run_in_executor( global_thread_pool, self._load_data_url, url_spec, media_io ) return await future From d250a790e215fb880792488c05c078af159924ce Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Mon, 13 Apr 2026 15:44:31 +0300 Subject: [PATCH 5/9] taskfile Signed-off-by: Sage Ahrac --- Taskfile.yml | 156 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 156 insertions(+) create mode 100644 Taskfile.yml diff --git a/Taskfile.yml b/Taskfile.yml new file mode 100644 index 000000000000..92087dc24945 --- /dev/null +++ b/Taskfile.yml @@ -0,0 +1,156 @@ +# https://taskfile.dev +version: '3' + +vars: + MODEL: '{{.MODEL | default "llava-hf/llava-1.5-7b-hf"}}' + PORT: '{{.PORT | default "8000"}}' + TRACE_DIR: '{{.TRACE_DIR | default "/tmp/vllm-traces"}}' + PYTHON: .venv/bin/python + LOG_FILE: /tmp/vllm-server.log + # Stable public image used for ad-hoc profiling requests + IMAGE_URL: '{{.IMAGE_URL | default "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"}}' + +tasks: + default: + desc: List all tasks + cmds: + - task --list + silent: true + + # ── Server ──────────────────────────────────────────────────────────── + + serve: + desc: "Start vLLM with torch profiler + custom record_function scopes. Logs tee → {{.LOG_FILE}}" + cmds: + - mkdir -p {{.TRACE_DIR}} + - | + VLLM_CUSTOM_SCOPES_FOR_PROFILING=1 \ + .venv/bin/vllm serve {{.MODEL}} \ + --port {{.PORT}} \ + --profiler-config '{"profiler":"torch","torch_profiler_dir":"{{.TRACE_DIR}}"}' \ + 2>&1 | tee {{.LOG_FILE}} + + serve:plain: + desc: Start vLLM without profiler (baseline / smoke test) + cmds: + - .venv/bin/vllm serve {{.MODEL}} --port {{.PORT}} 2>&1 | tee {{.LOG_FILE}} + + # ── Profiler control ────────────────────────────────────────────────── + + profile:start: + desc: POST /start_profile — open a trace window + cmds: + - curl -fsS -X POST http://localhost:{{.PORT}}/start_profile && echo " profiler started" + + profile:stop: + desc: POST /stop_profile — flush trace files to {{.TRACE_DIR}} + cmds: + - curl -fsS -X POST http://localhost:{{.PORT}}/stop_profile && echo " profiler stopped" + + profile:request: + desc: "Send one VLM chat request with an image URL (IMAGE_URL= to override)" + cmds: + - | + curl -s http://localhost:{{.PORT}}/v1/chat/completions \ + -H "Content-Type: application/json" \ + -d '{ + "model": "{{.MODEL}}", + "messages": [{ + "role": "user", + "content": [ + {"type": "image_url", "image_url": {"url": "{{.IMAGE_URL}}"}}, + {"type": "text", "text": "Describe this image in one sentence."} + ] + }], + "max_tokens": 64 + }' | {{.PYTHON}} -m json.tool + + profile:run: + desc: "One-shot profiling: start → one VLM request → stop (captures one request end-to-end)" + cmds: + - task: profile:start + - task: profile:request + - task: profile:stop + - task: traces:ls + + # ── Stress test ─────────────────────────────────────────────────────── + + stress: + desc: "Stress test the render endpoint. Override: N=50 C=8 task stress" + vars: + N: '{{.N | default "20"}}' + C: '{{.C | default "4"}}' + cmds: + - | + {{.PYTHON}} stress_test_render.py \ + --url http://localhost:{{.PORT}} \ + --n {{.N}} \ + --concurrency {{.C}} \ + --mode chat + + # ── Traces ──────────────────────────────────────────────────────────── + + traces:ls: + desc: List trace files in TRACE_DIR, split by frontend vs GPU worker + cmds: + - | + echo "=== Traces in {{.TRACE_DIR}} ===" + echo "" + echo "Frontend (event-loop: url_download, mm_processor, prefill/decode labels):" + ls -lht {{.TRACE_DIR}}/async_llm.*.json.gz 2>/dev/null || echo " (none)" + echo "" + echo "GPU worker (mm_encoder:forward, execute_model, CUDA kernels):" + ls -lht {{.TRACE_DIR}}/rank*.*.json.gz 2>/dev/null || echo " (none)" + silent: true + + traces:open: + desc: Open Perfetto UI and print the latest frontend + GPU trace paths to upload + cmds: + - open https://ui.perfetto.dev + - | + echo "" + echo "Drag & drop both files into Perfetto (or File → Open Trace File):" + echo "" + printf " Frontend : " + ls -t {{.TRACE_DIR}}/async_llm.*.json.gz 2>/dev/null | head -1 || echo "(none)" + printf " GPU : " + ls -t {{.TRACE_DIR}}/rank0.*.json.gz 2>/dev/null | head -1 || echo "(none)" + echo "" + echo "Both traces share wall-clock time and will align on the same timeline." + silent: true + + traces:unzip: + desc: Decompress all .gz traces (needed by tools that require raw JSON) + cmds: + - | + found=0 + for f in {{.TRACE_DIR}}/*.json.gz; do + [ -f "$f" ] || continue + found=1 + out="${f%.gz}" + if [ ! -f "$out" ]; then + gunzip -k "$f" && echo "Decompressed: $out" + else + echo "Already exists: $out" + fi + done + [ "$found" -eq 0 ] && echo "No .gz traces found in {{.TRACE_DIR}}" + + traces:clean: + desc: Remove all traces from TRACE_DIR + prompt: "Delete all traces in {{.TRACE_DIR}}?" + cmds: + - rm -f {{.TRACE_DIR}}/*.json.gz {{.TRACE_DIR}}/*.json + - echo "Cleaned {{.TRACE_DIR}}" + + # ── Logs ────────────────────────────────────────────────────────────── + + logs: + desc: "Tail the server log (task serve tees output to {{.LOG_FILE}})" + cmds: + - tail -f {{.LOG_FILE}} + + logs:grep: + desc: "Search server log for keyword: PATTERN=mm_processor task logs:grep" + cmds: + - grep -i "{{.PATTERN}}" {{.LOG_FILE}} | tail -100 From 973cca943e3f8110b24c75198fdb1207c33ebc8b Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Mon, 13 Apr 2026 17:49:04 +0300 Subject: [PATCH 6/9] profiler: add summary_trace.py and experiment tasks Signed-off-by: Sage Ahrac --- Taskfile.yml | 165 ++++++++++++++++++++++++++++++++++++++++- tools/summary_trace.py | 119 +++++++++++++++++++++++++++++ 2 files changed, 282 insertions(+), 2 deletions(-) create mode 100644 tools/summary_trace.py diff --git a/Taskfile.yml b/Taskfile.yml index 92087dc24945..e1dfefd2019f 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -2,13 +2,18 @@ version: '3' vars: - MODEL: '{{.MODEL | default "llava-hf/llava-1.5-7b-hf"}}' + MODEL: '{{.MODEL | default "google/gemma-3-27b-it"}}' PORT: '{{.PORT | default "8000"}}' TRACE_DIR: '{{.TRACE_DIR | default "/tmp/vllm-traces"}}' + LOCAL_TRACE_DIR: '{{.LOCAL_TRACE_DIR | default "./traces"}}' PYTHON: .venv/bin/python LOG_FILE: /tmp/vllm-server.log + # false = only custom spans visible; true = full Python stacks (needed for flamegraph) + WITH_STACK: '{{.WITH_STACK | default "true"}}' # Stable public image used for ad-hoc profiling requests IMAGE_URL: '{{.IMAGE_URL | default "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"}}' + # High-res public domain image (~8 MP) + IMAGE_URL_HIRES: '{{.IMAGE_URL_HIRES | default "https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg"}}' tasks: default: @@ -27,7 +32,7 @@ tasks: VLLM_CUSTOM_SCOPES_FOR_PROFILING=1 \ .venv/bin/vllm serve {{.MODEL}} \ --port {{.PORT}} \ - --profiler-config '{"profiler":"torch","torch_profiler_dir":"{{.TRACE_DIR}}"}' \ + --profiler-config '{"profiler":"torch","torch_profiler_dir":"{{.TRACE_DIR}}","torch_profiler_with_stack":{{.WITH_STACK}},"max_iterations":1}' \ 2>&1 | tee {{.LOG_FILE}} serve:plain: @@ -90,6 +95,31 @@ tasks: # ── Traces ──────────────────────────────────────────────────────────── + traces:pull: + desc: "Copy latest traces from pod → LOCAL_TRACE_DIR (default ./traces)" + vars: + POD: '{{.POD | default "vllm-dev"}}' + cmds: + - mkdir -p {{.LOCAL_TRACE_DIR}} + - | + frontend=$(oc exec {{.POD}} -- bash -c "ls -t {{.TRACE_DIR}}/*.async_llm.*.json.gz 2>/dev/null | head -1") + gpu=$(oc exec {{.POD}} -- bash -c "ls -t {{.TRACE_DIR}}/rank0.*.json.gz 2>/dev/null | head -1") + [ -z "$frontend" ] && echo "No frontend trace found" && exit 1 + [ -z "$gpu" ] && echo "No GPU trace found" && exit 1 + oc cp {{.POD}}:$frontend {{.LOCAL_TRACE_DIR}}/frontend.json.gz + oc cp {{.POD}}:$gpu {{.LOCAL_TRACE_DIR}}/rank0.json.gz + echo "Pulled to {{.LOCAL_TRACE_DIR}}/" + ls -lh {{.LOCAL_TRACE_DIR}}/*.json.gz + + traces:summary: + desc: "Build summary.json.gz from frontend + GPU traces (LOCAL_TRACE_DIR= to override)" + cmds: + - | + {{.PYTHON}} tools/summary_trace.py \ + {{.LOCAL_TRACE_DIR}}/frontend.json.gz \ + {{.LOCAL_TRACE_DIR}}/rank0.json.gz \ + {{.LOCAL_TRACE_DIR}}/summary.json.gz + traces:ls: desc: List trace files in TRACE_DIR, split by frontend vs GPU worker cmds: @@ -143,6 +173,137 @@ tasks: - rm -f {{.TRACE_DIR}}/*.json.gz {{.TRACE_DIR}}/*.json - echo "Cleaned {{.TRACE_DIR}}" + # ── Experiments ─────────────────────────────────────────────────────── + # Each experiment restarts the server to ensure a cold cache, then captures + # one profiled request and saves traces to traces/expN/. + # + # Experiments: + # 1. small image — URL + # 2. hi-res image — URL + # 3. small image — base64 + # 4. hi-res image — base64 + + _exp:restart-server: + internal: true + desc: Kill + restart vllm in pod with profiling, wait until ready + vars: + POD: '{{.POD | default "vllm-dev"}}' + cmds: + - | + oc exec {{.POD}} -- bash -c " + kill \$(pgrep -f 'vllm serve') 2>/dev/null || true; sleep 3 + rm -f {{.TRACE_DIR}}/*.json.gz {{.TRACE_DIR}}/*.txt + nohup env VLLM_CUSTOM_SCOPES_FOR_PROFILING=1 \ + HF_TOKEN='$HF_TOKEN' HUGGING_FACE_HUB_TOKEN='$HF_TOKEN' \ + /workspace/vllm/.venv/bin/vllm serve {{.MODEL}} \ + --port {{.PORT}} \ + --profiler-config '{\"profiler\":\"torch\",\"torch_profiler_dir\":\"{{.TRACE_DIR}}\",\"torch_profiler_with_stack\":{{.WITH_STACK}},\"max_iterations\":1}' \ + > /tmp/vllm-serve.log 2>&1 & + echo \"Server PID \$!\" + until curl -sf http://localhost:{{.PORT}}/v1/models > /dev/null 2>&1; do + sleep 5 + done + echo ready" + + _exp:capture: + internal: true + desc: "Profile one URL-based request. Vars: IMAGE_URL, OUTDIR" + vars: + POD: '{{.POD | default "vllm-dev"}}' + cmds: + - | + oc exec {{.POD}} -- bash -c " + curl -fsS -X POST http://localhost:{{.PORT}}/start_profile + curl -sf http://localhost:{{.PORT}}/v1/chat/completions \ + -H 'Content-Type: application/json' \ + -d '{\"model\":\"{{.MODEL}}\",\"messages\":[{\"role\":\"user\",\"content\":[{\"type\":\"image_url\",\"image_url\":{\"url\":\"{{.IMAGE_URL}}\"}},{\"type\":\"text\",\"text\":\"Describe this image in one sentence.\"}]}],\"max_tokens\":64}' \ + > /dev/null + curl -fsS -X POST http://localhost:{{.PORT}}/stop_profile" + - task: traces:pull + vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" } + - task: traces:summary + vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" } + + _exp:capture-base64: + internal: true + # Download and encode the image on the pod so the payload never crosses the + # local→pod boundary. Large hi-res images (several MB base64) would exceed + # shell ARG_MAX if constructed locally and passed via oc exec bash -c "...". + desc: "Profile one base64 request. Vars: IMAGE_URL, OUTDIR" + vars: + POD: '{{.POD | default "vllm-dev"}}' + cmds: + - | + oc exec -i {{.POD}} -- python3 - <<'PYEOF' +import base64, json, urllib.request as R + +base = "http://localhost:{{.PORT}}" +url = "{{.IMAGE_URL}}" +model = "{{.MODEL}}" + +b64 = base64.b64encode(R.urlopen(url).read()).decode() +body = json.dumps({ + "model": model, + "messages": [{"role": "user", "content": [ + {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64," + b64}}, + {"type": "text", "text": "Describe this image in one sentence."}, + ]}], + "max_tokens": 64, +}).encode() + +R.urlopen(R.Request(f"{base}/start_profile", method="POST")) +R.urlopen(R.Request(f"{base}/v1/chat/completions", body, {"Content-Type": "application/json"})) +R.urlopen(R.Request(f"{base}/stop_profile", method="POST")) +PYEOF + - task: traces:pull + vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" } + - task: traces:summary + vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" } + + exp:1: + desc: "Experiment 1 — small image, URL delivery" + cmds: + - task: _exp:restart-server + - task: _exp:capture + vars: + IMAGE_URL: "{{.IMAGE_URL}}" + OUTDIR: ./traces/exp1_small_url + + exp:2: + desc: "Experiment 2 — hi-res image, URL delivery" + cmds: + - task: _exp:restart-server + - task: _exp:capture + vars: + IMAGE_URL: "{{.IMAGE_URL_HIRES}}" + OUTDIR: ./traces/exp2_hires_url + + exp:3: + desc: "Experiment 3 — small image, base64 delivery" + cmds: + - task: _exp:restart-server + - task: _exp:capture-base64 + vars: + IMAGE_URL: "{{.IMAGE_URL}}" + OUTDIR: ./traces/exp3_small_base64 + + exp:4: + desc: "Experiment 4 — hi-res image, base64 delivery" + cmds: + - task: _exp:restart-server + - task: _exp:capture-base64 + vars: + IMAGE_URL: "{{.IMAGE_URL_HIRES}}" + OUTDIR: ./traces/exp4_hires_base64 + + exp:all: + desc: "Run all 4 experiments sequentially (server restart between each)" + cmds: + - task: exp:1 + - task: exp:2 + - task: exp:3 + - task: exp:4 + # ── Logs ────────────────────────────────────────────────────────────── logs: diff --git a/tools/summary_trace.py b/tools/summary_trace.py new file mode 100644 index 000000000000..698584107d40 --- /dev/null +++ b/tools/summary_trace.py @@ -0,0 +1,119 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Build a Perfetto summary trace from vLLM frontend + GPU worker traces. + +Extracts the key VLM pipeline spans and aligns them by wall-clock time. +Both traces use CLOCK_MONOTONIC microseconds (same machine), so timestamps +are directly comparable — no clock conversion needed. + +Output tracks: + Frontend — media: url_download, media: pil_decode, mm_processor: process_multimodal + GPU Worker — mm_encoder: forward, gpu_model_runner: forward + +Usage: + python tools/summary_trace.py \\ + traces/frontend.json.gz traces/rank0.json.gz traces/summary.json.gz +""" + +import argparse +import gzip +import json +from pathlib import Path + +FRONTEND_SPANS = { + "media: url_download", + "media: pil_decode", + "mm_processor: process_multimodal", +} + +GPU_SPANS = { + "mm_encoder: forward", + "gpu_model_runner: forward", +} + +TID_FRONTEND = 1 +TID_GPU = 2 + + +def load_spans(path: Path, names: set) -> list: + with gzip.open(path) as f: + data = json.load(f) + return [ + e for e in data["traceEvents"] if e.get("ph") == "X" and e.get("name") in names + ] + + +def main() -> None: + parser = argparse.ArgumentParser( + description=__doc__, formatter_class=argparse.RawDescriptionHelpFormatter + ) + parser.add_argument( + "frontend", type=Path, help="Frontend trace (async_llm.*.json.gz)" + ) + parser.add_argument("gpu", type=Path, help="GPU worker trace (rank0.*.json.gz)") + parser.add_argument("output", type=Path, help="Output summary trace (.json.gz)") + args = parser.parse_args() + + frontend_spans = load_spans(args.frontend, FRONTEND_SPANS) + gpu_spans = load_spans(args.gpu, GPU_SPANS) + + all_spans = frontend_spans + gpu_spans + if not all_spans: + raise SystemExit( + "No matching spans found — check that VLLM_CUSTOM_SCOPES_FOR_PROFILING=1" + ) + + # Normalize timestamps: t=0 is the start of the earliest span. + t0 = min(e["ts"] for e in all_spans) + + events = [] + for e in all_spans: + tid = TID_GPU if e["name"] in GPU_SPANS else TID_FRONTEND + events.append( + { + "ph": "X", + "cat": "user_annotation", + "name": e["name"], + "pid": 1, + "tid": tid, + "ts": e["ts"] - t0, + "dur": e["dur"], + } + ) + + events += [ + { + "ph": "M", + "pid": 1, + "tid": TID_FRONTEND, + "name": "thread_name", + "args": {"name": "Frontend"}, + }, + { + "ph": "M", + "pid": 1, + "tid": TID_GPU, + "name": "thread_name", + "args": {"name": "GPU Worker"}, + }, + ] + + args.output.parent.mkdir(parents=True, exist_ok=True) + with gzip.open(args.output, "wt") as f: + json.dump({"traceEvents": events, "displayTimeUnit": "ms"}, f) + + # Print summary table + span_events = sorted( + (e for e in events if e.get("ph") == "X"), key=lambda e: e["ts"] + ) + print(f"\n{'Span':<45} {'start':>9} {'dur':>9}") + print("-" * 67) + for e in span_events: + print(f"{e['name']:<45} {e['ts'] / 1000:>8.1f}ms {e['dur'] / 1000:>8.1f}ms") + print(f"\nWritten: {args.output}") + + +if __name__ == "__main__": + main() From 272304d61d6e1e03ab3f968c74de2aaf717901ac Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Mon, 13 Apr 2026 17:56:23 +0300 Subject: [PATCH 7/9] profiler: fix stop_profile ordering and add data: URL span Signed-off-by: Sage Ahrac --- vllm/multimodal/media/connector.py | 21 ++++++++++----------- vllm/v1/engine/async_llm.py | 9 ++++++++- 2 files changed, 18 insertions(+), 12 deletions(-) diff --git a/vllm/multimodal/media/connector.py b/vllm/multimodal/media/connector.py index 4ce9532e4e64..f09019923f17 100644 --- a/vllm/multimodal/media/connector.py +++ b/vllm/multimodal/media/connector.py @@ -301,12 +301,11 @@ def load_from_url( return media_io.load_bytes(cached) connection = self.connection - with record_function_or_nullcontext("media: url_download"): - data = connection.get_bytes( - url_spec.url, - timeout=fetch_timeout, - allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS, - ) + data = connection.get_bytes( + url_spec.url, + timeout=fetch_timeout, + allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS, + ) self._put_cached_bytes(url, data) return media_io.load_bytes(data) @@ -359,13 +358,13 @@ async def load_from_url_async( ) if url_spec.scheme == "data": - future: asyncio.Future[_M] = loop.run_in_executor( - global_thread_pool, self._load_data_url, url_spec, media_io - ) - return await future + with record_function_or_nullcontext("media: data_decode"): + return await loop.run_in_executor( + global_thread_pool, self._load_data_url, url_spec, media_io + ) if url_spec.scheme == "file": - future = loop.run_in_executor( + future: asyncio.Future[_M] = loop.run_in_executor( global_thread_pool, self._load_file_url, url_spec, media_io ) return await future diff --git a/vllm/v1/engine/async_llm.py b/vllm/v1/engine/async_llm.py index 6f7b79c87d29..0ab4d0147365 100644 --- a/vllm/v1/engine/async_llm.py +++ b/vllm/v1/engine/async_llm.py @@ -881,9 +881,16 @@ async def start_profile(self, profile_prefix: str | None = None) -> None: await self.engine_core.profile_async(True, profile_prefix) async def stop_profile(self) -> None: - await self.engine_core.profile_async(False) + # Stop the frontend profiler first (same thread as start(), required by + # kineto), then tell the engine. Doing it the other way around would + # leave the frontend profiler open while awaiting the engine roundtrip, + # capturing idle asyncio noise after the request is already done. + # profiler.stop() calls on_trace_ready which writes the trace file — + # this is blocking I/O on the event loop, but stop_profile is a debug + # path so the brief pause is acceptable. if self.profiler is not None: self.profiler.stop() + await self.engine_core.profile_async(False) async def reset_mm_cache(self) -> None: await self.renderer.clear_mm_cache_async() From f44933f6e20e5baf202f14f23771f5c27fbb63f9 Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Mon, 13 Apr 2026 18:06:28 +0300 Subject: [PATCH 8/9] profiler: fix base64 capture task and add capture_request.py Signed-off-by: Sage Ahrac --- Taskfile.yml | 38 ++++------ tools/capture_request.py | 148 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 162 insertions(+), 24 deletions(-) create mode 100644 tools/capture_request.py diff --git a/Taskfile.yml b/Taskfile.yml index e1dfefd2019f..bd5a9c75d079 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -226,35 +226,25 @@ tasks: _exp:capture-base64: internal: true - # Download and encode the image on the pod so the payload never crosses the - # local→pod boundary. Large hi-res images (several MB base64) would exceed - # shell ARG_MAX if constructed locally and passed via oc exec bash -c "...". + # Use oc port-forward so capture_request.py runs locally with --base64, + # downloading the image locally and sending the encoded payload to the pod. + # This avoids heredoc-in-YAML issues and keeps Python code out of the Taskfile. desc: "Profile one base64 request. Vars: IMAGE_URL, OUTDIR" vars: POD: '{{.POD | default "vllm-dev"}}' + LOCAL_PORT: "18000" cmds: - | - oc exec -i {{.POD}} -- python3 - <<'PYEOF' -import base64, json, urllib.request as R - -base = "http://localhost:{{.PORT}}" -url = "{{.IMAGE_URL}}" -model = "{{.MODEL}}" - -b64 = base64.b64encode(R.urlopen(url).read()).decode() -body = json.dumps({ - "model": model, - "messages": [{"role": "user", "content": [ - {"type": "image_url", "image_url": {"url": "data:image/jpeg;base64," + b64}}, - {"type": "text", "text": "Describe this image in one sentence."}, - ]}], - "max_tokens": 64, -}).encode() - -R.urlopen(R.Request(f"{base}/start_profile", method="POST")) -R.urlopen(R.Request(f"{base}/v1/chat/completions", body, {"Content-Type": "application/json"})) -R.urlopen(R.Request(f"{base}/stop_profile", method="POST")) -PYEOF + oc port-forward {{.POD}} {{.LOCAL_PORT}}:{{.PORT}} & + PFW=$! + trap "kill $PFW 2>/dev/null" EXIT + sleep 2 + python3 tools/capture_request.py \ + --endpoint http://localhost:{{.LOCAL_PORT}} \ + --model {{.MODEL}} \ + --image {{.IMAGE_URL}} \ + --base64 + kill $PFW 2>/dev/null || true - task: traces:pull vars: { LOCAL_TRACE_DIR: "{{.OUTDIR}}" } - task: traces:summary diff --git a/tools/capture_request.py b/tools/capture_request.py new file mode 100644 index 000000000000..ae6fc7e73524 --- /dev/null +++ b/tools/capture_request.py @@ -0,0 +1,148 @@ +#!/usr/bin/env python3 +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +Send one profiled VLM request to a running vLLM server. + +Wraps the request with /start_profile and /stop_profile so the trace +contains exactly one request. Supports URL and base64 image delivery. + +Usage: + # URL mode (server fetches the image): + python tools/capture_request.py \\ + --endpoint http://localhost:8000 \\ + --model google/gemma-3-27b-it \\ + --image https://example.com/cat.jpg + + # base64 mode (image encoded and embedded in the request body): + python tools/capture_request.py \\ + --endpoint http://localhost:8000 \\ + --model google/gemma-3-27b-it \\ + --image https://example.com/cat.jpg \\ + --base64 +""" + +import argparse +import json +import sys +import urllib.request as R +from urllib.error import URLError + +import pybase64 as base64 + + +def _post( + url: str, body: bytes | None = None, content_type: str | None = None +) -> bytes: + headers = {} + if content_type: + headers["Content-Type"] = content_type + req = R.Request( + url, data=body, headers=headers, method="POST" if body is None else None + ) + req.method = "POST" + try: + return R.urlopen(req).read() + except URLError as e: + print(f"ERROR: {url} → {e}", file=sys.stderr) + raise + + +def build_image_content(image_url: str, base64_mode: bool) -> dict: + if not base64_mode: + return {"type": "image_url", "image_url": {"url": image_url}} + + print(f"Downloading {image_url} for base64 encoding...", file=sys.stderr) + img_bytes = R.urlopen(image_url).read() + + # Infer MIME type from URL extension; default to jpeg. + lower = image_url.lower() + if lower.endswith(".png"): + mime = "image/png" + elif lower.endswith(".webp"): + mime = "image/webp" + elif lower.endswith(".gif"): + mime = "image/gif" + else: + mime = "image/jpeg" + + b64 = base64.b64encode(img_bytes).decode() + print( + f"Encoded {len(img_bytes):,} bytes → {len(b64):,} base64 chars mime={mime}", + file=sys.stderr, + ) + return {"type": "image_url", "image_url": {"url": f"data:{mime};base64,{b64}"}} + + +def main() -> None: + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.RawDescriptionHelpFormatter, + ) + parser.add_argument( + "--endpoint", + default="http://localhost:8000", + help="vLLM server base URL (default: http://localhost:8000)", + ) + parser.add_argument("--model", required=True, help="Model name as served by vLLM") + parser.add_argument( + "--image", + required=True, + metavar="URL", + help="Image URL to send (fetched from server for URL mode, " + "or downloaded locally and base64-encoded for --base64)", + ) + parser.add_argument( + "--base64", + action="store_true", + help="Download image locally and embed as base64 data URL", + ) + parser.add_argument( + "--prompt", + default="Describe this image in one sentence.", + help="Text prompt to send alongside the image", + ) + parser.add_argument("--max-tokens", type=int, default=64) + args = parser.parse_args() + + base = args.endpoint.rstrip("/") + image_content = build_image_content(args.image, args.base64) + + payload = json.dumps( + { + "model": args.model, + "messages": [ + { + "role": "user", + "content": [ + image_content, + {"type": "text", "text": args.prompt}, + ], + } + ], + "max_tokens": args.max_tokens, + } + ).encode() + + print("Starting profiler...", file=sys.stderr) + _post(f"{base}/start_profile") + + print("Sending request...", file=sys.stderr) + resp_bytes = _post(f"{base}/v1/chat/completions", payload, "application/json") + result = json.loads(resp_bytes) + content = result["choices"][0]["message"]["content"] + usage = result.get("usage", {}) + prompt_t = usage.get("prompt_tokens", 0) + compl_t = usage.get("completion_tokens", 0) + print( + f"Response ({prompt_t}→{compl_t} tokens): {content[:120]}", + file=sys.stderr, + ) + + print("Stopping profiler...", file=sys.stderr) + _post(f"{base}/stop_profile") + print("Done.", file=sys.stderr) + + +if __name__ == "__main__": + main() From 2af7c78a7dcd424d84d6c3933ae443fd0e982bae Mon Sep 17 00:00:00 2001 From: Sage Ahrac Date: Mon, 13 Apr 2026 18:48:02 +0300 Subject: [PATCH 9/9] profiler: fix kill pattern, add data_decode span, fix image URLs Signed-off-by: Sage Ahrac --- Taskfile.yml | 23 +++++++++++++++-------- tools/capture_request.py | 5 +---- tools/summary_trace.py | 6 ++++-- 3 files changed, 20 insertions(+), 14 deletions(-) diff --git a/Taskfile.yml b/Taskfile.yml index bd5a9c75d079..bb30410b9179 100644 --- a/Taskfile.yml +++ b/Taskfile.yml @@ -13,7 +13,7 @@ vars: # Stable public image used for ad-hoc profiling requests IMAGE_URL: '{{.IMAGE_URL | default "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"}}' # High-res public domain image (~8 MP) - IMAGE_URL_HIRES: '{{.IMAGE_URL_HIRES | default "https://upload.wikimedia.org/wikipedia/commons/a/a7/Camponotus_flavomarginatus_ant.jpg"}}' + IMAGE_URL_HIRES: '{{.IMAGE_URL_HIRES | default "https://picsum.photos/seed/vllm/3000/2000.jpg"}}' tasks: default: @@ -189,9 +189,13 @@ tasks: vars: POD: '{{.POD | default "vllm-dev"}}' cmds: + # Step 1a: kill old server. + # Run pkill directly (no bash -c wrapper) so the search pattern does not + # appear in any wrapper process's cmdline. pkill always excludes itself. + - oc exec {{.POD}} -- pkill -f /workspace/vllm/.venv/bin/vllm || true + # Step 1b: wipe old traces and launch new server (no sleep, no wait) - | oc exec {{.POD}} -- bash -c " - kill \$(pgrep -f 'vllm serve') 2>/dev/null || true; sleep 3 rm -f {{.TRACE_DIR}}/*.json.gz {{.TRACE_DIR}}/*.txt nohup env VLLM_CUSTOM_SCOPES_FOR_PROFILING=1 \ HF_TOKEN='$HF_TOKEN' HUGGING_FACE_HUB_TOKEN='$HF_TOKEN' \ @@ -199,11 +203,14 @@ tasks: --port {{.PORT}} \ --profiler-config '{\"profiler\":\"torch\",\"torch_profiler_dir\":\"{{.TRACE_DIR}}\",\"torch_profiler_with_stack\":{{.WITH_STACK}},\"max_iterations\":1}' \ > /tmp/vllm-serve.log 2>&1 & - echo \"Server PID \$!\" - until curl -sf http://localhost:{{.PORT}}/v1/models > /dev/null 2>&1; do - sleep 5 - done - echo ready" + echo \"Server PID \$!\"" + # Step 2: poll locally so oc exec never sits open for 2+ minutes + - | + echo "Waiting for server on {{.POD}}:{{.PORT}}..." + until oc exec {{.POD}} -- curl -sf http://localhost:{{.PORT}}/v1/models > /dev/null 2>&1; do + sleep 10 + done + echo ready _exp:capture: internal: true @@ -239,7 +246,7 @@ tasks: PFW=$! trap "kill $PFW 2>/dev/null" EXIT sleep 2 - python3 tools/capture_request.py \ + {{.PYTHON}} tools/capture_request.py \ --endpoint http://localhost:{{.LOCAL_PORT}} \ --model {{.MODEL}} \ --image {{.IMAGE_URL}} \ diff --git a/tools/capture_request.py b/tools/capture_request.py index ae6fc7e73524..ac61e6f4be7a 100644 --- a/tools/capture_request.py +++ b/tools/capture_request.py @@ -37,10 +37,7 @@ def _post( headers = {} if content_type: headers["Content-Type"] = content_type - req = R.Request( - url, data=body, headers=headers, method="POST" if body is None else None - ) - req.method = "POST" + req = R.Request(url, data=body, headers=headers, method="POST") try: return R.urlopen(req).read() except URLError as e: diff --git a/tools/summary_trace.py b/tools/summary_trace.py index 698584107d40..6f92835da961 100644 --- a/tools/summary_trace.py +++ b/tools/summary_trace.py @@ -9,7 +9,8 @@ are directly comparable — no clock conversion needed. Output tracks: - Frontend — media: url_download, media: pil_decode, mm_processor: process_multimodal + Frontend — media: url_download / media: data_decode, + media: pil_decode, mm_processor: process_multimodal GPU Worker — mm_encoder: forward, gpu_model_runner: forward Usage: @@ -23,7 +24,8 @@ from pathlib import Path FRONTEND_SPANS = { - "media: url_download", + "media: url_download", # HTTP URL path + "media: data_decode", # base64 / data: URL path "media: pil_decode", "mm_processor: process_multimodal", }