From ed7d980b95a5d78fde283d95934ca60928bd71bd Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Tue, 23 Jun 2026 15:48:30 +0200
Subject: [PATCH 1/6] Added TRT-LLM notebook

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 README.md                                     |   5 +-
 cookbooks/cosmos3/README.md                   |  54 +-
 .../cosmos3/generator/audiovisual/README.md   |  61 +-
 .../audiovisual/run_with_trt_llm.ipynb        | 807 ++++++++++++++++++
 4 files changed, 921 insertions(+), 6 deletions(-)
 create mode 100644 cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
diff --git a/README.md b/README.md
index 1baad1d9..6d07884c 100644
--- a/README.md
+++ b/README.md
@@ -209,8 +209,10 @@ Set `HF_HOME` if you want to use a shared cache or a disk with more space.
 Generator requires the Guardrail. Request access to the gated
 [nvidia/Cosmos-1.0-Guardrail](https://huggingface.co/nvidia/Cosmos-1.0-Guardrail)
 HF repository. To disable the guardrail, set `enable_safety_checker=False` (Diffusers),
-`guardrails: false` (vLLM-Omni `extra_params`/`extra_args`), or
+`TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` or `use_guardrails: false`
+(TensorRT-LLM), `guardrails: false` (vLLM-Omni `extra_params`/`extra_args`), or
 `--no-guardrails` (Cosmos Framework).
+
 #### Generator with Diffusers
 
 <details>
@@ -745,6 +747,7 @@ We are building examples that show Cosmos 3 capabilities end to end, including w
 | Generator (audiovisual) with Diffusers | Generator | Text-to-image, plus text-to-video and image-to-video each with or without synchronized sound, via `Cosmos3OmniPipeline`. | [Notebook](cookbooks/cosmos3/generator/audiovisual/run_with_diffusers.ipynb) | [![Render with nbviewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.org/github/nvidia/cosmos/blob/main/cookbooks/cosmos3/generator/audiovisual/run_with_diffusers.ipynb) |
 | Generator (audiovisual) with Cosmos Framework | Generator | Text-to-image, plus text-to-video and image-to-video each with sound on or off, through the `cosmos_framework.scripts.inference` entrypoint. | [Notebook](cookbooks/cosmos3/generator/audiovisual/run_with_cosmos_framework.ipynb) | [![Render with nbviewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.org/github/nvidia/cosmos/blob/main/cookbooks/cosmos3/generator/audiovisual/run_with_cosmos_framework.ipynb) |
 | Generator (audiovisual) with vLLM-Omni | Generator | Text-to-image, plus text-to-video and image-to-video each with sound on or off, against an OpenAI-compatible vLLM-Omni server. | [Notebook](cookbooks/cosmos3/generator/audiovisual/run_with_vllm_omni.ipynb) | [![Render with nbviewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.org/github/nvidia/cosmos/blob/main/cookbooks/cosmos3/generator/audiovisual/run_with_vllm_omni.ipynb) |
+| Generator (audiovisual) with TensorRT-LLM | Generator | Text-to-video and image-to-video against an OpenAI-compatible TensorRT-LLM VisualGen server. | [Notebook](cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb) | [![Render with nbviewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.org/github/nvidia/cosmos/blob/main/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb) |
 | Forward dynamics with Cosmos Framework | Generator | Forward dynamics: action-conditioned future-observation prediction for AV, DROID, and UMI, through the `cosmos_framework.scripts.inference` entrypoint. | [Notebook](cookbooks/cosmos3/generator/action/run_fd_with_cosmos_framework.ipynb) | [![Render with nbviewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.org/github/nvidia/cosmos/blob/main/cookbooks/cosmos3/generator/action/run_fd_with_cosmos_framework.ipynb) |
 | Forward dynamics with vLLM-Omni | Generator | Forward dynamics: action-conditioned future-observation prediction for AV, DROID, and UMI, against an OpenAI-compatible vLLM-Omni server. | [Notebook](cookbooks/cosmos3/generator/action/run_fd_with_vllm.ipynb) | [![Render with nbviewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.org/github/nvidia/cosmos/blob/main/cookbooks/cosmos3/generator/action/run_fd_with_vllm.ipynb) |
 | Inverse dynamics with Cosmos Framework | Generator | Inverse dynamics: ego-motion trajectory prediction from input AV video, through the `cosmos_framework.scripts.inference` entrypoint. | [Notebook](cookbooks/cosmos3/generator/action/run_id_with_cosmos_framework.ipynb) | [![Render with nbviewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.org/github/nvidia/cosmos/blob/main/cookbooks/cosmos3/generator/action/run_id_with_cosmos_framework.ipynb) |
diff --git a/cookbooks/cosmos3/README.md b/cookbooks/cosmos3/README.md
index 4c1f9fdc..0d6fa54d 100644
--- a/cookbooks/cosmos3/README.md
+++ b/cookbooks/cosmos3/README.md
@@ -8,6 +8,7 @@ backend you want to run and follow that one section.
 | --- | --- | --- |
 | [Cosmos Framework](#cosmos-framework) | Native PyTorch inference, launched with `torchrun` | Reasoner, Generator (Audiovisual, Action, **Transfer**) |
 | [Diffusers](#diffusers) | Direct generation with `Cosmos3OmniPipeline` | Generator (Audiovisual) |
+| [TensorRT-LLM](#tensorrt-llm) | OpenAI-compatible VisualGen server (image/video generation) | Generator (Audiovisual) |
 | [Transformers](#transformers) | Hugging Face Transformers inference | Reasoner |
 | [vLLM](#vllm) | OpenAI-compatible reasoning server (image/video understanding) | Reasoner |
 | [vLLM-Omni](#vllm-omni) | OpenAI-compatible generation server (image/video/audio/action) | Generator (Audiovisual, Action) |
@@ -28,9 +29,10 @@ backend you want to run and follow that one section.
   export HF_TOKEN=<your_token>
   ```
 
-  To disable the guardrail, set `enable_safety_checker=False` (Diffusers), `guardrails: false`
-  (vLLM-Omni `extra_params`/`extra_args`), or
-  `--no-guardrails` (Cosmos Framework).
+  To disable the guardrail, set `enable_safety_checker=False` (Diffusers),
+  `TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` or `use_guardrails: false`
+  (TensorRT-LLM), `guardrails: false` (vLLM-Omni
+  `extra_params`/`extra_args`), or `--no-guardrails` (Cosmos Framework).
 - For the Cosmos Framework backend: access to `git@github.com:NVIDIA/cosmos-framework.git`.
 - For the NIM backend: an NGC API key (used as `NGC_API_KEY`), which you can generate on [build.nvidia.com](https://build.nvidia.com/nvidia/cosmos3-nano-reasoner) or [NGC](https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/cosmos3-reasoner), plus a one-time `docker login nvcr.io` (username `$oauthtoken`, password = your key). The HF login above is not needed for NIM.
 - Enough local disk for the venv/image, the uv cache, and the model cache. Nano
@@ -161,6 +163,52 @@ uv pip install --torch-backend=cu130 \
   transformers
 ```
 
+## TensorRT-LLM
+
+OpenAI-compatible **VisualGen** server for Generator audiovisual text-to-video
+and image-to-video examples. Cosmos3 support was added in TensorRT-LLM PR
+[#14824](https://github.com/NVIDIA/TensorRT-LLM/pull/14824); use a
+TensorRT-LLM checkout or package that includes that change.
+
+Install TensorRT-LLM following its upstream documentation, then install the
+Cosmos3 guardrail package in the same environment unless you explicitly disable
+guardrails before starting the server:
+
+```bash
+pip install cosmos_guardrail==0.3.0
+# If needed by your OpenCV stack:
+# pip uninstall opencv-python
+```
+
+Set the TensorRT-LLM source root (used to locate the shared VisualGen config YAMLs) and the server port:
+
+```bash
+export TRTLLM_ROOT="${TRTLLM_ROOT:-$PWD/TensorRT-LLM}"
+export COSMOS3_TRTLLM_PORT="${COSMOS3_TRTLLM_PORT:-8000}"
+```
+
+**Cosmos3-Nano** (single GPU):
+
+```bash
+trtllm-serve nvidia/Cosmos3-Nano \
+  --visual_gen_args "$TRTLLM_ROOT/examples/visual_gen/configs/cosmos3-nano-1gpu.yaml" \
+  --port "$COSMOS3_TRTLLM_PORT"
+```
+
+**Cosmos3-Super** (four GPUs; CFG parallelism with Ulysses, plus parallel VAE):
+
+```bash
+torchrun --nproc_per_node=4 -m tensorrt_llm.commands.serve \
+  nvidia/Cosmos3-Super \
+  --visual_gen_args "$TRTLLM_ROOT/examples/visual_gen/configs/cosmos3-super-4gpu.yaml" \
+  --port "$COSMOS3_TRTLLM_PORT"
+```
+
+The server exposes `/health`, `/v1/videos/generations`, `/v1/videos`, and
+`/v1/images/generations`. The audiovisual notebook uses only the validated
+`/v1/videos/generations` endpoint, for both text-to-video and image-to-video. If
+`ffmpeg` is available, `format="auto"` returns MP4; otherwise TensorRT-LLM falls back to AVI.
+
 ## Transformers
 
 Local Python inference for the Cosmos3 Reasoner. This backend uses the
diff --git a/cookbooks/cosmos3/generator/audiovisual/README.md b/cookbooks/cosmos3/generator/audiovisual/README.md
index d80adad4..1fed17b8 100644
--- a/cookbooks/cosmos3/generator/audiovisual/README.md
+++ b/cookbooks/cosmos3/generator/audiovisual/README.md
@@ -1,7 +1,7 @@
 # Cosmos3 Generator Audiovisual Examples
 
 Generate images and video (with optional audio) from text or image prompts with
-`Cosmos3-Nano` and `Cosmos3-Super`, across three inference backends. Sample
+`Cosmos3-Nano` and `Cosmos3-Super`, across four inference backends. Sample
 prompts live under [`assets/`](./assets).
 
 Environment setup for every backend is centralized in the shared
@@ -12,7 +12,8 @@ to get one generation running per backend — run them from this folder.
 Generator requires the Guardrail. Request access to the gated
 [nvidia/Cosmos-1.0-Guardrail](https://huggingface.co/nvidia/Cosmos-1.0-Guardrail)
 HF repository before running these examples. To disable the guardrail, set
-`enable_safety_checker=False` (Diffusers), `guardrails: false` (vLLM-Omni
+`enable_safety_checker=False` (Diffusers), `TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1`
+or `use_guardrails: false` (TensorRT-LLM), `guardrails: false` (vLLM-Omni
 `extra_params`/`extra_args`), or `--no-guardrails` (Cosmos Framework).
 
 ## Run with Cosmos Framework
@@ -184,3 +185,59 @@ the vLLM-Omni backend: it walks through text-to-image, text-to-video, and
 image-to-video requests with audio on or off. Server launch options (Nano and
 Super, tensor parallelism, layerwise offload, and CFG-parallel variants) live in
 the [shared environment setup guide](../../README.md#vllm-omni).
+
+## Run with TensorRT-LLM
+
+### Quickstart
+
+Set up the environment and start the server:
+[TensorRT-LLM setup](../../README.md#tensorrt-llm). The notebook targets the
+OpenAI-compatible VisualGen API served by `trtllm-serve`.
+
+Send a text-to-video request with the synchronous video API:
+
+```python
+import json
+from pathlib import Path
+
+import requests
+
+prompt = json.load(open("assets/prompts/text2video/robot_kitchen.json"))
+negative = json.load(open("assets/negative_prompts/text2video/neg_prompt.json"))
+
+response = requests.post(
+    "http://localhost:8000/v1/videos/generations",
+    json={
+        "prompt": json.dumps(prompt, ensure_ascii=True, separators=(",", ":")),
+        "negative_prompt": json.dumps(negative, ensure_ascii=True, separators=(",", ":")),
+        "size": "1280x720",
+        "num_frames": 189,
+        "fps": 24,
+        "num_inference_steps": 35,
+        "guidance_scale": 6.0,
+        "seed": 0,
+        "format": "auto",
+        "extra_params": {
+            "use_resolution_template": False,
+            "use_duration_template": False,
+            "use_system_prompt": False,
+            "use_guardrails": True,
+        },
+    },
+)
+response.raise_for_status()
+suffix = ".avi" if "x-msvideo" in response.headers.get("content-type", "") else ".mp4"
+Path(f"/tmp/cosmos3_t2v_trtllm{suffix}").write_bytes(response.content)
+```
+
+For image-to-video, post multipart form data to the same endpoint and attach the
+reference image as the `input_reference` form field. This section does not cover
+audio or action generation with TensorRT-LLM Cosmos3.
+
+### Notebook walkthrough
+
+[`run_with_trt_llm.ipynb`](./run_with_trt_llm.ipynb) is the full tutorial for the
+TensorRT-LLM backend: it walks through text-to-video and image-to-video requests
+against an already-running VisualGen server. Server launch options (Nano and
+Super, FP8 dynamic quantization, CFG parallelism, Ulysses, and parallel VAE)
+live in the [shared environment setup guide](../../README.md#tensorrt-llm).
diff --git a/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb b/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
new file mode 100644
index 00000000..3668f7f6
--- /dev/null
+++ b/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
@@ -0,0 +1,807 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "<!-- SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.\n",
+    "SPDX-License-Identifier: OpenMDW-1.1 -->"
+   ],
+   "id": "license-header"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Cosmos3 Generator Audiovisual with TensorRT-LLM\n",
+    "\n",
+    "This notebook sends requests to already-running TensorRT-LLM VisualGen servers by invoking `curl` as a subprocess from Python.\n",
+    "\n",
+    "The examples are split into Cosmos3-Nano and Cosmos3-Super sections. Each section is self-contained, so you can run just one. The notebook covers TensorRT-LLM's stable server flow for text-to-video and image-to-video generation.\n"
+   ],
+   "id": "title"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Prerequisites\n",
+    "\n",
+    "You need a running TensorRT-LLM server with Cosmos3 VisualGen support. If the server is not at the local default (`http://localhost:8000`), set the endpoint environment variables below before running the setup cell. Video generation uses `/v1/videos/generations`.\n",
+    "\n",
+    "Generator requires the Guardrail. Request access to the gated [nvidia/Cosmos-1.0-Guardrail](https://huggingface.co/nvidia/Cosmos-1.0-Guardrail) HF repository before running these examples. TensorRT-LLM loads guardrails by default; to disable them, set `TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` before starting the server or set `use_guardrails` to `False` in the request `extra_params`.\n",
+    "\n",
+    "```bash\n",
+    "export COSMOS3_TRTLLM_BASE_URL=http://localhost:8000\n",
+    "export COSMOS3_TRTLLM_NANO_BASE_URL=http://localhost:8000\n",
+    "export COSMOS3_TRTLLM_SUPER_BASE_URL=http://localhost:8000\n",
+    "export COSMOS3_TRTLLM_API_KEY=tensorrt_llm\n",
+    "```\n"
+   ],
+   "id": "prerequisites"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Start the Server\n",
+    "\n",
+    "Run the TensorRT-LLM VisualGen server before running the request cells. The config YAMLs below come from TensorRT-LLM's Cosmos3 support.\n",
+    "\n",
+    "### Cosmos3-Nano\n",
+    "\n",
+    "From the directory that contains your `TensorRT-LLM` checkout (or with `TRTLLM_ROOT` already set), and with TensorRT-LLM installed in the active environment:\n",
+    "\n",
+    "```bash\n",
+    "export TRTLLM_ROOT=\"${TRTLLM_ROOT:-$PWD/TensorRT-LLM}\"\n",
+    "\n",
+    "trtllm-serve nvidia/Cosmos3-Nano   --visual_gen_args \"$TRTLLM_ROOT/examples/visual_gen/configs/cosmos3-nano-1gpu.yaml\"   --port 8000\n",
+    "```\n",
+    "\n",
+    "### Cosmos3-Super\n",
+    "\n",
+    "`Cosmos3-Super` uses the four-GPU config from TensorRT-LLM. The config sets `cfg_size=2`, `ulysses_size=2`, and `parallel_vae_size=4`, so launch exactly four processes.\n",
+    "\n",
+    "```bash\n",
+    "export TRTLLM_ROOT=\"${TRTLLM_ROOT:-$PWD/TensorRT-LLM}\"\n",
+    "\n",
+    "torchrun --nproc_per_node=4 -m tensorrt_llm.commands.serve   nvidia/Cosmos3-Super   --visual_gen_args \"$TRTLLM_ROOT/examples/visual_gen/configs/cosmos3-super-4gpu.yaml\"   --port 8000\n",
+    "```\n",
+    "\n",
+    "TensorRT-LLM exposes `/health` when the server is ready. If `ffmpeg` is installed, `format=\"auto\"` returns MP4; otherwise TensorRT-LLM falls back to AVI.\n"
+   ],
+   "id": "start-server"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Configure Paths and Endpoints\n",
+    "\n",
+    "This setup cell only configures repo/output paths and TensorRT-LLM endpoint settings.\n"
+   ],
+   "id": "configure"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "import os\n",
+    "\n",
+    "\n",
+    "def find_repo_root(start: Path) -> Path:\n",
+    "    for path in [start, *start.parents]:\n",
+    "        if (path / \"README.md\").exists() and (path / \"cookbooks\").exists():\n",
+    "            return path\n",
+    "    return start\n",
+    "\n",
+    "\n",
+    "COSMOS_ROOT = find_repo_root(Path.cwd().resolve())\n",
+    "COSMOS3_AUDIOVISUAL_ROOT = COSMOS_ROOT / \"cookbooks\" / \"cosmos3\" / \"generator\" / \"audiovisual\"\n",
+    "COSMOS3_AUDIOVISUAL_OUTPUT_ROOT = Path(\n",
+    "    os.environ.get(\"COSMOS3_AUDIOVISUAL_OUTPUT_ROOT\", COSMOS3_AUDIOVISUAL_ROOT / \"outputs\" / \"notebooks\")\n",
+    ").resolve()\n",
+    "DEFAULT_TRTLLM_BASE_URL = os.environ.get(\"COSMOS3_TRTLLM_BASE_URL\", \"http://localhost:8000\")\n",
+    "TRTLLM_ENDPOINTS = {\n",
+    "    \"Cosmos3-Nano\": os.environ.get(\"COSMOS3_TRTLLM_NANO_BASE_URL\", DEFAULT_TRTLLM_BASE_URL),\n",
+    "    \"Cosmos3-Super\": os.environ.get(\"COSMOS3_TRTLLM_SUPER_BASE_URL\", DEFAULT_TRTLLM_BASE_URL),\n",
+    "}\n",
+    "\n",
+    "os.environ[\"COSMOS3_AUDIOVISUAL_OUTPUT_ROOT\"] = str(COSMOS3_AUDIOVISUAL_OUTPUT_ROOT)\n",
+    "os.environ.setdefault(\"COSMOS3_TRTLLM_API_KEY\", \"tensorrt_llm\")\n",
+    "\n",
+    "print(\"COSMOS_ROOT:\", COSMOS_ROOT)\n",
+    "print(\"COSMOS3_AUDIOVISUAL_OUTPUT_ROOT:\", COSMOS3_AUDIOVISUAL_OUTPUT_ROOT)\n",
+    "for model, endpoint in TRTLLM_ENDPOINTS.items():\n",
+    "    print(f\"{model} endpoint: {endpoint}\")\n"
+   ],
+   "id": "setup-code"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 4. Verify Endpoint Configuration\n"
+   ],
+   "id": "verify"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from urllib.parse import urlparse\n",
+    "\n",
+    "\n",
+    "def api_root_url(base_url: str) -> str:\n",
+    "    normalized = base_url.rstrip(\"/\")\n",
+    "    if not normalized.endswith(\"/v1\"):\n",
+    "        normalized = f\"{normalized}/v1\"\n",
+    "    return normalized\n",
+    "\n",
+    "\n",
+    "def server_root_url(base_url: str) -> str:\n",
+    "    normalized = base_url.rstrip(\"/\")\n",
+    "    if normalized.endswith(\"/v1\"):\n",
+    "        normalized = normalized[:-3]\n",
+    "    return normalized.rstrip(\"/\")\n",
+    "\n",
+    "\n",
+    "def health_url(base_url: str) -> str:\n",
+    "    return f\"{server_root_url(base_url)}/health\"\n",
+    "\n",
+    "\n",
+    "def video_api_url(base_url: str) -> str:\n",
+    "    return f\"{api_root_url(base_url)}/videos/generations\"\n",
+    "\n",
+    "\n",
+    "for model, base_url in TRTLLM_ENDPOINTS.items():\n",
+    "    parsed = urlparse(api_root_url(base_url))\n",
+    "    print(model)\n",
+    "    print(\"  api root:\", api_root_url(base_url))\n",
+    "    print(\"  health:\", health_url(base_url))\n",
+    "    print(\"  videos generations:\", video_api_url(base_url))\n",
+    "    print(\"  scheme:\", parsed.scheme)\n",
+    "    print(\"  host:\", parsed.netloc)\n"
+   ],
+   "id": "endpoint-code"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 5. Preview Available Inputs\n"
+   ],
+   "id": "preview"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "from pathlib import Path\n",
+    "import json\n",
+    "from IPython.display import Image, display\n",
+    "\n",
+    "assets_dir = COSMOS3_AUDIOVISUAL_ROOT / \"assets\"\n",
+    "for prompt_dir in sorted((assets_dir / \"prompts\").iterdir()):\n",
+    "    if not prompt_dir.is_dir():\n",
+    "        continue\n",
+    "    print(f\"{prompt_dir.relative_to(assets_dir)}:\")\n",
+    "    for prompt_path in sorted(prompt_dir.glob(\"*.json\")):\n",
+    "        data = json.loads(prompt_path.read_text())\n",
+    "        caption = (\n",
+    "            data.get(\"temporal_caption\")\n",
+    "            or data.get(\"comprehensive_t2i_caption\")\n",
+    "            or data.get(\"extra\", {}).get(\"prompt\", \"\")\n",
+    "        )\n",
+    "        print(f\"  {prompt_path.name}: {caption[:180]}{'...' if len(caption) > 180 else ''}\")\n",
+    "    print()\n",
+    "\n",
+    "for image_dir in sorted((assets_dir / \"images\").iterdir()):\n",
+    "    if not image_dir.is_dir():\n",
+    "        continue\n",
+    "    print(f\"{image_dir.relative_to(assets_dir)}:\")\n",
+    "    for image_path in sorted(image_dir.iterdir()):\n",
+    "        if image_path.suffix.lower() in {\".jpg\", \".jpeg\", \".png\", \".webp\", \".bmp\"}:\n",
+    "            print(f\"  {image_path.name}\")\n",
+    "            display(Image(filename=str(image_path), width=420))\n"
+   ],
+   "id": "preview-code"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 6. Define Asset Sets, Payload Helpers, Request Helpers, and Viewer Helpers\n"
+   ],
+   "id": "helpers"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import base64\n",
+    "import html\n",
+    "import json\n",
+    "import os\n",
+    "import subprocess\n",
+    "import time\n",
+    "import urllib.error\n",
+    "import urllib.request\n",
+    "from pathlib import Path\n",
+    "from IPython.display import HTML, Image, display\n",
+    "\n",
+    "\n",
+    "def api_root_url(base_url: str) -> str:\n",
+    "    normalized = base_url.rstrip(\"/\")\n",
+    "    if not normalized.endswith(\"/v1\"):\n",
+    "        normalized = f\"{normalized}/v1\"\n",
+    "    return normalized\n",
+    "\n",
+    "\n",
+    "def server_root_url(base_url: str) -> str:\n",
+    "    normalized = base_url.rstrip(\"/\")\n",
+    "    if normalized.endswith(\"/v1\"):\n",
+    "        normalized = normalized[:-3]\n",
+    "    return normalized.rstrip(\"/\")\n",
+    "\n",
+    "\n",
+    "def health_url(base_url: str) -> str:\n",
+    "    return f\"{server_root_url(base_url)}/health\"\n",
+    "\n",
+    "\n",
+    "def video_api_url(base_url: str) -> str:\n",
+    "    return f\"{api_root_url(base_url)}/videos/generations\"\n",
+    "\n",
+    "\n",
+    "IMAGE_EXTENSIONS = {\".jpg\", \".jpeg\", \".png\", \".webp\", \".bmp\"}\n",
+    "\n",
+    "FIXED_SAMPLING = {\n",
+    "    \"num_steps\": 35,\n",
+    "    \"guidance\": 6.0,\n",
+    "    \"fps\": 24,\n",
+    "    \"num_frames\": 189,\n",
+    "    \"resolution\": \"720\",\n",
+    "    \"aspect_ratio\": \"16,9\",\n",
+    "    \"seed\": 0,\n",
+    "}\n",
+    "\n",
+    "TRTLLM_EXTRA_PARAMS = {\n",
+    "    \"use_resolution_template\": False,\n",
+    "    \"use_duration_template\": False,\n",
+    "    \"use_system_prompt\": False,\n",
+    "    \"use_guardrails\": True,\n",
+    "}\n",
+    "\n",
+    "# This notebook only exercises TensorRT-LLM's stable video generation endpoint.\n",
+    "# Text-to-image requests are intentionally omitted until that endpoint has been\n",
+    "# validated for the TensorRT-LLM build in use.\n",
+    "ASSET_SETS = {\n",
+    "    \"t2v_nano\": {\n",
+    "        \"model\": \"Cosmos3-Nano\",\n",
+    "        \"mode\": \"text2video\",\n",
+    "        \"prompt\": \"assets/prompts/text2video/robot_kitchen.json\",\n",
+    "    },\n",
+    "    \"i2v_nano\": {\n",
+    "        \"model\": \"Cosmos3-Nano\",\n",
+    "        \"mode\": \"image2video\",\n",
+    "        \"prompt\": \"assets/prompts/image2video/car_driving.json\",\n",
+    "        \"image\": \"assets/images/image2video/car_driving.jpg\",\n",
+    "    },\n",
+    "    \"t2v_super\": {\n",
+    "        \"model\": \"Cosmos3-Super\",\n",
+    "        \"mode\": \"text2video\",\n",
+    "        \"prompt\": \"assets/prompts/text2video/robot_kitchen.json\",\n",
+    "    },\n",
+    "    \"i2v_super\": {\n",
+    "        \"model\": \"Cosmos3-Super\",\n",
+    "        \"mode\": \"image2video\",\n",
+    "        \"prompt\": \"assets/prompts/image2video/car_driving.json\",\n",
+    "        \"image\": \"assets/images/image2video/car_driving.jpg\",\n",
+    "    },\n",
+    "}\n",
+    "\n",
+    "\n",
+    "def asset_path(relative_path: str) -> Path:\n",
+    "    path = COSMOS3_AUDIOVISUAL_ROOT / relative_path\n",
+    "    if not path.exists():\n",
+    "        raise FileNotFoundError(path)\n",
+    "    return path.resolve()\n",
+    "\n",
+    "\n",
+    "def compact_json_file(path: Path) -> str:\n",
+    "    return json.dumps(json.loads(path.read_text()), ensure_ascii=True, separators=(\",\", \":\"))\n",
+    "\n",
+    "\n",
+    "def payload_dimensions(payload: dict) -> tuple[int, int]:\n",
+    "    if payload.get(\"resolution\") == \"720\" and payload.get(\"aspect_ratio\") == \"16,9\":\n",
+    "        return 720, 1280\n",
+    "    if payload.get(\"resolution\") == \"256\" and payload.get(\"aspect_ratio\") == \"16,9\":\n",
+    "        return 192, 320\n",
+    "    raise ValueError(f\"Unsupported payload resolution/aspect ratio: {payload.get('resolution')} {payload.get('aspect_ratio')}\")\n",
+    "\n",
+    "\n",
+    "def resolve_payload_path(payload_path: Path, value: str) -> Path:\n",
+    "    path = Path(value)\n",
+    "    if path.is_absolute():\n",
+    "        return path\n",
+    "    return (payload_path.parent / path).resolve()\n",
+    "\n",
+    "\n",
+    "def create_payload(use_case: str, *, backend: str = \"trt_llm\") -> tuple[Path, Path, str]:\n",
+    "    spec = ASSET_SETS[use_case]\n",
+    "    payload_dir = Path(os.environ[\"COSMOS3_AUDIOVISUAL_OUTPUT_ROOT\"]) / backend / \"payloads\" / use_case\n",
+    "    output_dir = Path(os.environ[\"COSMOS3_AUDIOVISUAL_OUTPUT_ROOT\"]) / backend / use_case\n",
+    "    payload_dir.mkdir(parents=True, exist_ok=True)\n",
+    "    output_dir.mkdir(parents=True, exist_ok=True)\n",
+    "\n",
+    "    prompt_path = asset_path(spec[\"prompt\"])\n",
+    "    negative_prompt_path = asset_path(f\"assets/negative_prompts/{spec['mode']}/neg_prompt.json\")\n",
+    "    payload_path = payload_dir / f\"{use_case}.json\"\n",
+    "    payload = {\n",
+    "        \"model_mode\": spec[\"mode\"],\n",
+    "        \"name\": use_case,\n",
+    "        \"prompt\": compact_json_file(prompt_path),\n",
+    "        \"negative_prompt\": compact_json_file(negative_prompt_path),\n",
+    "        \"extra_params\": dict(TRTLLM_EXTRA_PARAMS),\n",
+    "        **FIXED_SAMPLING,\n",
+    "    }\n",
+    "    if spec[\"mode\"] == \"image2video\":\n",
+    "        image_path = asset_path(spec[\"image\"])\n",
+    "        payload[\"vision_path\"] = os.path.relpath(image_path, payload_path.parent)\n",
+    "\n",
+    "    payload_path.write_text(json.dumps(payload, indent=2) + \"\\n\")\n",
+    "\n",
+    "    os.environ[f\"COSMOS3_{backend.upper()}_{use_case.upper()}_INPUT\"] = str(payload_path)\n",
+    "    os.environ[f\"COSMOS3_{backend.upper()}_{use_case.upper()}_OUTPUT\"] = str(output_dir)\n",
+    "\n",
+    "    print(f\"model:   {spec['model']}\")\n",
+    "    print(f\"payload: {payload_path}\")\n",
+    "    print(f\"output:  {output_dir}\")\n",
+    "    print(f\"prompt:  {prompt_path.relative_to(COSMOS_ROOT)}\")\n",
+    "    if \"vision_path\" in payload:\n",
+    "        image_display_path = resolve_payload_path(payload_path, payload[\"vision_path\"])\n",
+    "        print(f\"image:   {image_display_path.relative_to(COSMOS_ROOT)}\")\n",
+    "        display(Image(filename=str(image_display_path), width=420))\n",
+    "    print(json.dumps({k: payload[k] for k in [\"model_mode\", \"name\", \"num_steps\", \"guidance\", \"fps\", \"num_frames\", \"resolution\", \"aspect_ratio\", \"seed\", \"extra_params\"]}, indent=2))\n",
+    "    return payload_path, output_dir, spec[\"model\"]\n",
+    "\n",
+    "\n",
+    "def check_trtllm_server(model: str, timeout_s: int = 1800, interval_s: int = 10) -> None:\n",
+    "    url = health_url(TRTLLM_ENDPOINTS[model])\n",
+    "    deadline = time.time() + timeout_s\n",
+    "    print(f\"waiting for {model} server: {url}\")\n",
+    "    last_error = None\n",
+    "    while time.time() < deadline:\n",
+    "        try:\n",
+    "            with urllib.request.urlopen(url, timeout=5) as response:\n",
+    "                if response.status == 200:\n",
+    "                    print(f\"{model} server is ready\")\n",
+    "                    return\n",
+    "                last_error = f\"HTTP {response.status}\"\n",
+    "        except (urllib.error.URLError, TimeoutError, OSError) as exc:\n",
+    "            last_error = str(exc)\n",
+    "        print(f\"not ready yet: {last_error}\")\n",
+    "        time.sleep(interval_s)\n",
+    "    raise TimeoutError(f\"Timed out waiting for {model} server at {url}. Last error: {last_error}\")\n",
+    "\n",
+    "\n",
+    "def build_trtllm_video_body(payload: dict) -> dict:\n",
+    "    height, width = payload_dimensions(payload)\n",
+    "    return {\n",
+    "        \"prompt\": payload[\"prompt\"],\n",
+    "        \"negative_prompt\": payload[\"negative_prompt\"],\n",
+    "        \"size\": f\"{width}x{height}\",\n",
+    "        \"num_frames\": payload[\"num_frames\"],\n",
+    "        \"fps\": payload[\"fps\"],\n",
+    "        \"num_inference_steps\": payload[\"num_steps\"],\n",
+    "        \"guidance_scale\": payload[\"guidance\"],\n",
+    "        \"seed\": payload[\"seed\"],\n",
+    "        \"format\": \"auto\",\n",
+    "        \"extra_params\": payload[\"extra_params\"],\n",
+    "    }\n",
+    "\n",
+    "\n",
+    "def _auth_headers() -> list[str]:\n",
+    "    api_key = os.environ.get(\"COSMOS3_TRTLLM_API_KEY\", \"\")\n",
+    "    return [\"-H\", f\"Authorization: Bearer {api_key}\"] if api_key else []\n",
+    "\n",
+    "\n",
+    "def _video_extension_from_response(header_text: str, content_path: Path) -> str:\n",
+    "    lowered = header_text.lower()\n",
+    "    if \"video/x-msvideo\" in lowered or \"video/avi\" in lowered:\n",
+    "        return \".avi\"\n",
+    "    head = content_path.read_bytes()[:16]\n",
+    "    if head.startswith(b\"RIFF\") and b\"AVI\" in head[:12]:\n",
+    "        return \".avi\"\n",
+    "    if len(head) >= 12 and head[4:8] == b\"ftyp\":\n",
+    "        return \".mp4\"\n",
+    "    return \".mp4\"\n",
+    "\n",
+    "\n",
+    "def post_video(*, payload_path: Path, payload: dict, output_stem: Path, model: str) -> Path:\n",
+    "    url = video_api_url(TRTLLM_ENDPOINTS[model])\n",
+    "    tmp_path = Path(f\"{output_stem}.tmp\")\n",
+    "    header_path = Path(f\"{output_stem}.headers.txt\")\n",
+    "    error_path = Path(f\"{output_stem}.error.txt\")\n",
+    "    for path in [tmp_path, header_path, error_path]:\n",
+    "        if path.exists():\n",
+    "            path.unlink()\n",
+    "\n",
+    "    body = build_trtllm_video_body(payload)\n",
+    "    cmd = [\n",
+    "        \"curl\",\n",
+    "        \"-sS\",\n",
+    "        \"--fail-with-body\",\n",
+    "        \"-X\",\n",
+    "        \"POST\",\n",
+    "        url,\n",
+    "        \"-D\",\n",
+    "        str(header_path),\n",
+    "        \"-H\",\n",
+    "        \"Accept: video/mp4, video/x-msvideo, application/octet-stream\",\n",
+    "    ]\n",
+    "    cmd += _auth_headers()\n",
+    "\n",
+    "    if payload[\"model_mode\"] == \"image2video\":\n",
+    "        form_body = dict(body)\n",
+    "        form_body[\"extra_params\"] = json.dumps(form_body[\"extra_params\"], separators=(\",\", \":\"))\n",
+    "        for key, value in form_body.items():\n",
+    "            cmd += [\"--form-string\", f\"{key}={value}\"]\n",
+    "        image_path = resolve_payload_path(payload_path, payload[\"vision_path\"])\n",
+    "        cmd += [\"-F\", f\"input_reference=@{image_path}\"]\n",
+    "    else:\n",
+    "        cmd += [\"-H\", \"Content-Type: application/json\"]\n",
+    "        cmd += [\"-d\", json.dumps(body, separators=(\",\", \":\"))]\n",
+    "\n",
+    "    cmd += [\"-o\", str(tmp_path)]\n",
+    "    result = subprocess.run(cmd, text=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)\n",
+    "    if result.returncode != 0:\n",
+    "        error_path.write_text((result.stdout or \"\") + (result.stderr or \"\"))\n",
+    "        raise RuntimeError(f\"TensorRT-LLM request failed with exit code {result.returncode}; see {error_path}\")\n",
+    "\n",
+    "    header_text = header_path.read_text() if header_path.exists() else \"\"\n",
+    "    ext = _video_extension_from_response(header_text, tmp_path)\n",
+    "    output_path = output_stem.with_suffix(ext)\n",
+    "    if output_path.exists():\n",
+    "        output_path.unlink()\n",
+    "    tmp_path.replace(output_path)\n",
+    "    return output_path\n",
+    "\n",
+    "\n",
+    "def run_trtllm_payload(payload_path: Path, output_dir: str | Path, *, model: str) -> Path:\n",
+    "    payload_path = Path(payload_path)\n",
+    "    output_dir = Path(output_dir)\n",
+    "    output_dir.mkdir(parents=True, exist_ok=True)\n",
+    "    payload = json.loads(payload_path.read_text())\n",
+    "    output_stem = output_dir / payload[\"name\"]\n",
+    "    endpoint = video_api_url(TRTLLM_ENDPOINTS[model])\n",
+    "    print(\"endpoint:\", endpoint)\n",
+    "    print(\"payload:\", payload_path)\n",
+    "    print(\"output stem:\", output_stem)\n",
+    "    if payload[\"model_mode\"] == \"image2video\":\n",
+    "        print(\"input image:\", resolve_payload_path(payload_path, payload[\"vision_path\"]))\n",
+    "    t0 = time.time()\n",
+    "    output_path = post_video(payload_path=payload_path, payload=payload, output_stem=output_stem, model=model)\n",
+    "    print(f\"wrote {output_path} in {time.time() - t0:.1f}s\")\n",
+    "    return output_path\n",
+    "\n",
+    "\n",
+    "def display_video(path: Path, *, width: int = 720) -> None:\n",
+    "    suffix = path.suffix.lower()\n",
+    "    media_type = \"video/x-msvideo\" if suffix == \".avi\" else \"video/mp4\"\n",
+    "    data = base64.b64encode(path.read_bytes()).decode(\"ascii\")\n",
+    "    label = html.escape(str(path))\n",
+    "    markup = f\"\"\"\n",
+    "<video controls playsinline preload=\"metadata\" width=\"{width}\" style=\"max-width: 100%; background: #000;\">\n",
+    "  <source src=\"data:{media_type};base64,{data}\" type=\"{media_type}\">\n",
+    "</video>\n",
+    "<div style=\"font-family: monospace; font-size: 12px; margin-top: 4px;\">{label}</div>\n",
+    "\"\"\"\n",
+    "    display(HTML(markup))\n",
+    "\n",
+    "\n",
+    "def view_run(output_dir: str | Path) -> None:\n",
+    "    output_dir = Path(output_dir)\n",
+    "    videos = [\n",
+    "        path\n",
+    "        for path in sorted(output_dir.rglob(\"*\"))\n",
+    "        if path.suffix.lower() in {\".mp4\", \".avi\"}\n",
+    "        and not path.name.endswith((\"_preview.mp4\", \"_browser.mp4\"))\n",
+    "    ]\n",
+    "    if not videos:\n",
+    "        print(f\"No generated videos found under {output_dir}\")\n",
+    "        return\n",
+    "    for src in videos:\n",
+    "        print(f\"source: {src} ({src.stat().st_size // 1024} KB)\")\n",
+    "        display_video(src)\n"
+   ],
+   "id": "helpers-code"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Run each use case top-to-bottom: create the JSON payload, wait for the matching server, run inference, then view the generated media. The Cosmos3-Nano and Cosmos3-Super sections are independent, so you can run just one.\n"
+   ],
+   "id": "run-note"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Cosmos3-Nano Examples\n",
+    "\n",
+    "Use cases for the `Cosmos3-Nano` model. This section is self-contained; you can run it without the Cosmos3-Super section below.\n"
+   ],
+   "id": "nano-title"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Nano: Text to Video\n",
+    "\n",
+    "Nano text-to-video generation using a structured JSON prompt.\n"
+   ],
+   "id": "nano-t2v"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "t2v_nano_payload, t2v_nano_output, t2v_nano_model = create_payload(\"t2v_nano\")\n"
+   ],
+   "id": "nano-t2v-payload"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Run\n"
+   ],
+   "id": "nano-t2v-run-title"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "check_trtllm_server(t2v_nano_model)\n",
+    "run_trtllm_payload(t2v_nano_payload, t2v_nano_output, model=t2v_nano_model)\n"
+   ],
+   "id": "nano-t2v-run"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### View Results\n"
+   ],
+   "id": "nano-t2v-view-title"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "view_run(t2v_nano_output)\n"
+   ],
+   "id": "nano-t2v-view"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Nano: Image to Video\n",
+    "\n",
+    "Nano image-to-video generation using its paired image asset.\n"
+   ],
+   "id": "nano-i2v"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "i2v_nano_payload, i2v_nano_output, i2v_nano_model = create_payload(\"i2v_nano\")\n"
+   ],
+   "id": "nano-i2v-payload"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Run\n"
+   ],
+   "id": "nano-i2v-run-title"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "check_trtllm_server(i2v_nano_model)\n",
+    "run_trtllm_payload(i2v_nano_payload, i2v_nano_output, model=i2v_nano_model)\n"
+   ],
+   "id": "nano-i2v-run"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### View Results\n"
+   ],
+   "id": "nano-i2v-view-title"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "view_run(i2v_nano_output)\n"
+   ],
+   "id": "nano-i2v-view"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Cosmos3-Super Examples\n",
+    "\n",
+    "The same use cases for the larger `Cosmos3-Super` model. This section is self-contained; you can run it without the Cosmos3-Nano section above.\n"
+   ],
+   "id": "super-title"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Super: Text to Video\n",
+    "\n",
+    "Super text-to-video generation using the same structured JSON prompt.\n"
+   ],
+   "id": "super-t2v"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "t2v_super_payload, t2v_super_output, t2v_super_model = create_payload(\"t2v_super\")\n"
+   ],
+   "id": "super-t2v-payload"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Run\n"
+   ],
+   "id": "super-t2v-run-title"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "check_trtllm_server(t2v_super_model)\n",
+    "run_trtllm_payload(t2v_super_payload, t2v_super_output, model=t2v_super_model)\n"
+   ],
+   "id": "super-t2v-run"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### View Results\n"
+   ],
+   "id": "super-t2v-view-title"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "view_run(t2v_super_output)\n"
+   ],
+   "id": "super-t2v-view"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Super: Image to Video\n",
+    "\n",
+    "Super image-to-video generation using its paired image asset.\n"
+   ],
+   "id": "super-i2v"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "i2v_super_payload, i2v_super_output, i2v_super_model = create_payload(\"i2v_super\")\n"
+   ],
+   "id": "super-i2v-payload"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Run\n"
+   ],
+   "id": "super-i2v-run-title"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "check_trtllm_server(i2v_super_model)\n",
+    "run_trtllm_payload(i2v_super_payload, i2v_super_output, model=i2v_super_model)\n"
+   ],
+   "id": "super-i2v-run"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### View Results\n"
+   ],
+   "id": "super-i2v-view-title"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "view_run(i2v_super_output)\n"
+   ],
+   "id": "super-i2v-view"
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "base",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.7"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}

From 373591a79da22286f30afb54693c06a7a6f84834 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Tue, 23 Jun 2026 16:07:49 +0200
Subject: [PATCH 2/6] Updated trt-llm notebook

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 README.md                                     |  6 +++---
 cookbooks/cosmos3/README.md                   | 11 +++++-----
 .../cosmos3/generator/audiovisual/README.md   | 18 ++++++++---------
 .../audiovisual/run_with_trt_llm.ipynb        | 20 ++++++++++++-------
 4 files changed, 30 insertions(+), 25 deletions(-)

diff --git a/README.md b/README.md
index 6d07884c..94fa110c 100644
--- a/README.md
+++ b/README.md
@@ -209,9 +209,9 @@ Set `HF_HOME` if you want to use a shared cache or a disk with more space.
 Generator requires the Guardrail. Request access to the gated
 [nvidia/Cosmos-1.0-Guardrail](https://huggingface.co/nvidia/Cosmos-1.0-Guardrail)
 HF repository. To disable the guardrail, set `enable_safety_checker=False` (Diffusers),
-`TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` or `use_guardrails: false`
-(TensorRT-LLM), `guardrails: false` (vLLM-Omni `extra_params`/`extra_args`), or
-`--no-guardrails` (Cosmos Framework).
+`TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` (TensorRT-LLM; newer builds also support
+`use_guardrails: false` through `extra_params`), `guardrails: false` (vLLM-Omni
+`extra_params`/`extra_args`), or `--no-guardrails` (Cosmos Framework).
 
 #### Generator with Diffusers
 
diff --git a/cookbooks/cosmos3/README.md b/cookbooks/cosmos3/README.md
index 0d6fa54d..7582a6f1 100644
--- a/cookbooks/cosmos3/README.md
+++ b/cookbooks/cosmos3/README.md
@@ -30,9 +30,10 @@ backend you want to run and follow that one section.
   ```
 
   To disable the guardrail, set `enable_safety_checker=False` (Diffusers),
-  `TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` or `use_guardrails: false`
-  (TensorRT-LLM), `guardrails: false` (vLLM-Omni
-  `extra_params`/`extra_args`), or `--no-guardrails` (Cosmos Framework).
+  `TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` (TensorRT-LLM; newer builds also
+  support `use_guardrails: false` through `extra_params`), `guardrails: false`
+  (vLLM-Omni `extra_params`/`extra_args`), or `--no-guardrails` (Cosmos
+  Framework).
 - For the Cosmos Framework backend: access to `git@github.com:NVIDIA/cosmos-framework.git`.
 - For the NIM backend: an NGC API key (used as `NGC_API_KEY`), which you can generate on [build.nvidia.com](https://build.nvidia.com/nvidia/cosmos3-nano-reasoner) or [NGC](https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/cosmos3-reasoner), plus a one-time `docker login nvcr.io` (username `$oauthtoken`, password = your key). The HF login above is not needed for NIM.
 - Enough local disk for the venv/image, the uv cache, and the model cache. Nano
@@ -206,8 +207,8 @@ torchrun --nproc_per_node=4 -m tensorrt_llm.commands.serve \
 
 The server exposes `/health`, `/v1/videos/generations`, `/v1/videos`, and
 `/v1/images/generations`. The audiovisual notebook uses the validated video
-generation endpoint for text-to-video and image-to-video. If `ffmpeg` is
-available, `format="auto"` returns MP4; otherwise TensorRT-LLM falls back to AVI.
+generation endpoint for text-to-video and image-to-video and leaves the output
+format at the server default for compatibility across TensorRT-LLM builds.
 
 ## Transformers
 
diff --git a/cookbooks/cosmos3/generator/audiovisual/README.md b/cookbooks/cosmos3/generator/audiovisual/README.md
index 1fed17b8..928c6346 100644
--- a/cookbooks/cosmos3/generator/audiovisual/README.md
+++ b/cookbooks/cosmos3/generator/audiovisual/README.md
@@ -13,8 +13,9 @@ Generator requires the Guardrail. Request access to the gated
 [nvidia/Cosmos-1.0-Guardrail](https://huggingface.co/nvidia/Cosmos-1.0-Guardrail)
 HF repository before running these examples. To disable the guardrail, set
 `enable_safety_checker=False` (Diffusers), `TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1`
-or `use_guardrails: false` (TensorRT-LLM), `guardrails: false` (vLLM-Omni
-`extra_params`/`extra_args`), or `--no-guardrails` (Cosmos Framework).
+(TensorRT-LLM; newer builds also support `use_guardrails: false` through
+`extra_params`), `guardrails: false` (vLLM-Omni `extra_params`/`extra_args`),
+or `--no-guardrails` (Cosmos Framework).
 
 ## Run with Cosmos Framework
 
@@ -211,18 +212,11 @@ response = requests.post(
         "prompt": json.dumps(prompt, ensure_ascii=True, separators=(",", ":")),
         "negative_prompt": json.dumps(negative, ensure_ascii=True, separators=(",", ":")),
         "size": "1280x720",
-        "num_frames": 189,
+        "seconds": 189 / 24,
         "fps": 24,
         "num_inference_steps": 35,
         "guidance_scale": 6.0,
         "seed": 0,
-        "format": "auto",
-        "extra_params": {
-            "use_resolution_template": False,
-            "use_duration_template": False,
-            "use_system_prompt": False,
-            "use_guardrails": True,
-        },
     },
 )
 response.raise_for_status()
@@ -234,6 +228,10 @@ For image-to-video, post multipart form data to the same endpoint with the
 reference image under `input_reference`. TensorRT-LLM Cosmos3 audio/action
 generation is not covered by this backend section.
 
+Some TensorRT-LLM builds also accept model-specific `extra_params` such as
+`use_resolution_template`, `use_duration_template`, `use_system_prompt`, and
+`use_guardrails`. The notebook leaves these off by default for compatibility.
+
 ### Notebook walkthrough
 
 [`run_with_trt_llm.ipynb`](./run_with_trt_llm.ipynb) is the full tutorial for the
diff --git a/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb b/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
index 3668f7f6..20b6f8e0 100644
--- a/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
+++ b/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
@@ -29,13 +29,15 @@
     "\n",
     "Use a running TensorRT-LLM server with Cosmos3 VisualGen support and set endpoint environment variables before the setup cell if you are not using the local default. Video generation uses `/v1/videos/generations`.\n",
     "\n",
-    "Generator requires the Guardrail. Request access to the gated [nvidia/Cosmos-1.0-Guardrail](https://huggingface.co/nvidia/Cosmos-1.0-Guardrail) HF repository before running these examples. TensorRT-LLM loads guardrails by default; to disable them, set `TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` before starting the server or set `use_guardrails` to `False` in the request `extra_params`.\n",
+    "Generator requires the Guardrail. Request access to the gated [nvidia/Cosmos-1.0-Guardrail](https://huggingface.co/nvidia/Cosmos-1.0-Guardrail) HF repository before running these examples. TensorRT-LLM loads guardrails by default; to disable them, set `TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` before starting the server. Newer TensorRT-LLM builds also accept per-request `extra_params`; set `COSMOS3_TRTLLM_INCLUDE_EXTRA_PARAMS=1` before running the setup cell only if your server schema supports that field.\n",
     "\n",
     "```bash\n",
     "export COSMOS3_TRTLLM_BASE_URL=http://localhost:8000\n",
     "export COSMOS3_TRTLLM_NANO_BASE_URL=http://localhost:8000\n",
     "export COSMOS3_TRTLLM_SUPER_BASE_URL=http://localhost:8000\n",
     "export COSMOS3_TRTLLM_API_KEY=tensorrt_llm\n",
+    "# Optional for newer TensorRT-LLM builds that accept VideoGenerationRequest.extra_params:\n",
+    "# export COSMOS3_TRTLLM_INCLUDE_EXTRA_PARAMS=1\n",
     "```\n"
    ],
    "id": "prerequisites"
@@ -68,7 +70,7 @@
     "torchrun --nproc_per_node=4 -m tensorrt_llm.commands.serve   nvidia/Cosmos3-Super   --visual_gen_args \"$TRTLLM_ROOT/examples/visual_gen/configs/cosmos3-super-4gpu.yaml\"   --port 8000\n",
     "```\n",
     "\n",
-    "TensorRT-LLM exposes `/health` when the server is ready. If `ffmpeg` is installed, `format=\"auto\"` returns MP4; otherwise TensorRT-LLM falls back to AVI.\n"
+    "TensorRT-LLM exposes `/health` when the server is ready. The notebook leaves the response format at the server default so it works with both older and newer TensorRT-LLM VisualGen schemas.\n"
    ],
    "id": "start-server"
   },
@@ -112,9 +114,11 @@
     "\n",
     "os.environ[\"COSMOS3_AUDIOVISUAL_OUTPUT_ROOT\"] = str(COSMOS3_AUDIOVISUAL_OUTPUT_ROOT)\n",
     "os.environ.setdefault(\"COSMOS3_TRTLLM_API_KEY\", \"tensorrt_llm\")\n",
+    "INCLUDE_TRTLLM_EXTRA_PARAMS = os.environ.get(\"COSMOS3_TRTLLM_INCLUDE_EXTRA_PARAMS\", \"0\") == \"1\"\n",
     "\n",
     "print(\"COSMOS_ROOT:\", COSMOS_ROOT)\n",
     "print(\"COSMOS3_AUDIOVISUAL_OUTPUT_ROOT:\", COSMOS3_AUDIOVISUAL_OUTPUT_ROOT)\n",
+    "print(\"COSMOS3_TRTLLM_INCLUDE_EXTRA_PARAMS:\", INCLUDE_TRTLLM_EXTRA_PARAMS)\n",
     "for model, endpoint in TRTLLM_ENDPOINTS.items():\n",
     "    print(f\"{model} endpoint: {endpoint}\")\n"
    ],
@@ -396,18 +400,19 @@
     "\n",
     "def build_trtllm_video_body(payload: dict) -> dict:\n",
     "    height, width = payload_dimensions(payload)\n",
-    "    return {\n",
+    "    body = {\n",
     "        \"prompt\": payload[\"prompt\"],\n",
     "        \"negative_prompt\": payload[\"negative_prompt\"],\n",
     "        \"size\": f\"{width}x{height}\",\n",
-    "        \"num_frames\": payload[\"num_frames\"],\n",
+    "        \"seconds\": payload[\"num_frames\"] / payload[\"fps\"],\n",
     "        \"fps\": payload[\"fps\"],\n",
     "        \"num_inference_steps\": payload[\"num_steps\"],\n",
     "        \"guidance_scale\": payload[\"guidance\"],\n",
     "        \"seed\": payload[\"seed\"],\n",
-    "        \"format\": \"auto\",\n",
-    "        \"extra_params\": payload[\"extra_params\"],\n",
     "    }\n",
+    "    if INCLUDE_TRTLLM_EXTRA_PARAMS:\n",
+    "        body[\"extra_params\"] = payload[\"extra_params\"]\n",
+    "    return body\n",
     "\n",
     "\n",
     "def _auth_headers() -> list[str]:\n",
@@ -453,7 +458,8 @@
     "\n",
     "    if payload[\"model_mode\"] == \"image2video\":\n",
     "        form_body = dict(body)\n",
-    "        form_body[\"extra_params\"] = json.dumps(form_body[\"extra_params\"], separators=(\",\", \":\"))\n",
+    "        if \"extra_params\" in form_body:\n",
+    "            form_body[\"extra_params\"] = json.dumps(form_body[\"extra_params\"], separators=(\",\", \":\"))\n",
     "        for key, value in form_body.items():\n",
     "            cmd += [\"--form-string\", f\"{key}={value}\"]\n",
     "        image_path = resolve_payload_path(payload_path, payload[\"vision_path\"])\n",

From 3b690c48abc6a9603b23e7efe422f0582f5fff2e Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Tue, 23 Jun 2026 16:38:13 +0200
Subject: [PATCH 3/6] Added t2i

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 README.md                                     |   6 +-
 cookbooks/cosmos3/README.md                   |  29 ++--
 .../cosmos3/generator/audiovisual/README.md   |  31 ++--
 .../audiovisual/run_with_trt_llm.ipynb        | 158 ++++++++++++++++--
 4 files changed, 183 insertions(+), 41 deletions(-)

diff --git a/README.md b/README.md
index 94fa110c..7b153442 100644
--- a/README.md
+++ b/README.md
@@ -209,8 +209,8 @@ Set `HF_HOME` if you want to use a shared cache or a disk with more space.
 Generator requires the Guardrail. Request access to the gated
 [nvidia/Cosmos-1.0-Guardrail](https://huggingface.co/nvidia/Cosmos-1.0-Guardrail)
 HF repository. To disable the guardrail, set `enable_safety_checker=False` (Diffusers),
-`TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` (TensorRT-LLM; newer builds also support
-`use_guardrails: false` through `extra_params`), `guardrails: false` (vLLM-Omni
+`TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` or `use_guardrails: false` through
+`extra_params` (TensorRT-LLM), `guardrails: false` (vLLM-Omni
 `extra_params`/`extra_args`), or `--no-guardrails` (Cosmos Framework).
 
 #### Generator with Diffusers
@@ -747,7 +747,7 @@ We are building examples that show Cosmos 3 capabilities end to end, including w
 | Generator (audiovisual) with Diffusers | Generator | Text-to-image, plus text-to-video and image-to-video each with or without synchronized sound, via `Cosmos3OmniPipeline`. | [Notebook](cookbooks/cosmos3/generator/audiovisual/run_with_diffusers.ipynb) | [![Render with nbviewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.org/github/nvidia/cosmos/blob/main/cookbooks/cosmos3/generator/audiovisual/run_with_diffusers.ipynb) |
 | Generator (audiovisual) with Cosmos Framework | Generator | Text-to-image, plus text-to-video and image-to-video each with sound on or off, through the `cosmos_framework.scripts.inference` entrypoint. | [Notebook](cookbooks/cosmos3/generator/audiovisual/run_with_cosmos_framework.ipynb) | [![Render with nbviewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.org/github/nvidia/cosmos/blob/main/cookbooks/cosmos3/generator/audiovisual/run_with_cosmos_framework.ipynb) |
 | Generator (audiovisual) with vLLM-Omni | Generator | Text-to-image, plus text-to-video and image-to-video each with sound on or off, against an OpenAI-compatible vLLM-Omni server. | [Notebook](cookbooks/cosmos3/generator/audiovisual/run_with_vllm_omni.ipynb) | [![Render with nbviewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.org/github/nvidia/cosmos/blob/main/cookbooks/cosmos3/generator/audiovisual/run_with_vllm_omni.ipynb) |
-| Generator (audiovisual) with TensorRT-LLM | Generator | Text-to-video and image-to-video against an OpenAI-compatible TensorRT-LLM VisualGen server. | [Notebook](cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb) | [![Render with nbviewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.org/github/nvidia/cosmos/blob/main/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb) |
+| Generator (audiovisual) with TensorRT-LLM | Generator | Text-to-image, text-to-video, and image-to-video against an OpenAI-compatible TensorRT-LLM VisualGen server. | [Notebook](cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb) | [![Render with nbviewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.org/github/nvidia/cosmos/blob/main/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb) |
 | Forward dynamics with Cosmos Framework | Generator | Forward dynamics: action-conditioned future-observation prediction for AV, DROID, and UMI, through the `cosmos_framework.scripts.inference` entrypoint. | [Notebook](cookbooks/cosmos3/generator/action/run_fd_with_cosmos_framework.ipynb) | [![Render with nbviewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.org/github/nvidia/cosmos/blob/main/cookbooks/cosmos3/generator/action/run_fd_with_cosmos_framework.ipynb) |
 | Forward dynamics with vLLM-Omni | Generator | Forward dynamics: action-conditioned future-observation prediction for AV, DROID, and UMI, against an OpenAI-compatible vLLM-Omni server. | [Notebook](cookbooks/cosmos3/generator/action/run_fd_with_vllm.ipynb) | [![Render with nbviewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.org/github/nvidia/cosmos/blob/main/cookbooks/cosmos3/generator/action/run_fd_with_vllm.ipynb) |
 | Inverse dynamics with Cosmos Framework | Generator | Inverse dynamics: ego-motion trajectory prediction from input AV video, through the `cosmos_framework.scripts.inference` entrypoint. | [Notebook](cookbooks/cosmos3/generator/action/run_id_with_cosmos_framework.ipynb) | [![Render with nbviewer](https://raw.githubusercontent.com/jupyter/design/master/logos/Badges/nbviewer_badge.svg)](https://nbviewer.org/github/nvidia/cosmos/blob/main/cookbooks/cosmos3/generator/action/run_id_with_cosmos_framework.ipynb) |
diff --git a/cookbooks/cosmos3/README.md b/cookbooks/cosmos3/README.md
index 7582a6f1..3a367cae 100644
--- a/cookbooks/cosmos3/README.md
+++ b/cookbooks/cosmos3/README.md
@@ -30,10 +30,9 @@ backend you want to run and follow that one section.
   ```
 
   To disable the guardrail, set `enable_safety_checker=False` (Diffusers),
-  `TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` (TensorRT-LLM; newer builds also
-  support `use_guardrails: false` through `extra_params`), `guardrails: false`
-  (vLLM-Omni `extra_params`/`extra_args`), or `--no-guardrails` (Cosmos
-  Framework).
+  `TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` or `use_guardrails: false` through
+  `extra_params` (TensorRT-LLM), `guardrails: false` (vLLM-Omni
+  `extra_params`/`extra_args`), or `--no-guardrails` (Cosmos Framework).
 - For the Cosmos Framework backend: access to `git@github.com:NVIDIA/cosmos-framework.git`.
 - For the NIM backend: an NGC API key (used as `NGC_API_KEY`), which you can generate on [build.nvidia.com](https://build.nvidia.com/nvidia/cosmos3-nano-reasoner) or [NGC](https://catalog.ngc.nvidia.com/orgs/nim/teams/nvidia/containers/cosmos3-reasoner), plus a one-time `docker login nvcr.io` (username `$oauthtoken`, password = your key). The HF login above is not needed for NIM.
 - Enough local disk for the venv/image, the uv cache, and the model cache. Nano
@@ -166,14 +165,20 @@ uv pip install --torch-backend=cu130 \
 
 ## TensorRT-LLM
 
-OpenAI-compatible **VisualGen** server for Generator audiovisual text-to-video
-and image-to-video examples. Cosmos3 support was added in TensorRT-LLM PR
+OpenAI-compatible **VisualGen** server for Generator audiovisual text-to-image,
+text-to-video, and image-to-video examples. Cosmos3 support was added in TensorRT-LLM PR
 [#14824](https://github.com/NVIDIA/TensorRT-LLM/pull/14824); use a
 TensorRT-LLM checkout or package that includes that change.
 
-Install TensorRT-LLM following its upstream documentation, then install the
-Cosmos3 guardrail package in the same environment unless you explicitly disable
-guardrails before starting the server:
+Install TensorRT-LLM following its upstream documentation, or run the latest
+release container instead of installing it manually:
+
+```bash
+docker pull nvcr.io/nvidia/tensorrt-llm/release:latest
+```
+
+Then install the Cosmos3 guardrail package in the same environment unless you
+explicitly disable guardrails before starting the server:
 
 ```bash
 pip install cosmos_guardrail==0.3.0
@@ -207,8 +212,10 @@ torchrun --nproc_per_node=4 -m tensorrt_llm.commands.serve \
 
 The server exposes `/health`, `/v1/videos/generations`, `/v1/videos`, and
 `/v1/images/generations`. The audiovisual notebook uses the validated video
-generation endpoint for text-to-video and image-to-video and leaves the output
-format at the server default for compatibility across TensorRT-LLM builds.
+generation endpoint for text-to-image, text-to-video, and image-to-video. Cosmos3
+text-to-image is sent as a one-frame video request, matching the TensorRT-LLM
+Cosmos3 pipeline. Requests send Cosmos3 controls through `extra_params`, so use a
+TensorRT-LLM build that includes the Cosmos3 VisualGen API schema.
 
 ## Transformers
 
diff --git a/cookbooks/cosmos3/generator/audiovisual/README.md b/cookbooks/cosmos3/generator/audiovisual/README.md
index 928c6346..ace73c5c 100644
--- a/cookbooks/cosmos3/generator/audiovisual/README.md
+++ b/cookbooks/cosmos3/generator/audiovisual/README.md
@@ -13,9 +13,9 @@ Generator requires the Guardrail. Request access to the gated
 [nvidia/Cosmos-1.0-Guardrail](https://huggingface.co/nvidia/Cosmos-1.0-Guardrail)
 HF repository before running these examples. To disable the guardrail, set
 `enable_safety_checker=False` (Diffusers), `TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1`
-(TensorRT-LLM; newer builds also support `use_guardrails: false` through
-`extra_params`), `guardrails: false` (vLLM-Omni `extra_params`/`extra_args`),
-or `--no-guardrails` (Cosmos Framework).
+or `use_guardrails: false` through `extra_params` (TensorRT-LLM),
+`guardrails: false` (vLLM-Omni `extra_params`/`extra_args`), or
+`--no-guardrails` (Cosmos Framework).
 
 ## Run with Cosmos Framework
 
@@ -217,6 +217,12 @@ response = requests.post(
         "num_inference_steps": 35,
         "guidance_scale": 6.0,
         "seed": 0,
+        "extra_params": {
+            "use_resolution_template": False,
+            "use_duration_template": False,
+            "use_system_prompt": False,
+            "use_guardrails": True,
+        },
     },
 )
 response.raise_for_status()
@@ -228,14 +234,19 @@ For image-to-video, post multipart form data to the same endpoint with the
 reference image under `input_reference`. TensorRT-LLM Cosmos3 audio/action
 generation is not covered by this backend section.
 
-Some TensorRT-LLM builds also accept model-specific `extra_params` such as
-`use_resolution_template`, `use_duration_template`, `use_system_prompt`, and
-`use_guardrails`. The notebook leaves these off by default for compatibility.
+For text-to-image, use the same video generation endpoint with `seconds=1 / 24`
+and `fps=24`; TensorRT-LLM Cosmos3 returns a one-frame video response for this
+path.
+
+The TRT-LLM notebook always sends model-specific `extra_params`, so use a
+TensorRT-LLM release with the Cosmos3 VisualGen API schema. The latest release
+container is available at `nvcr.io/nvidia/tensorrt-llm/release:latest`.
 
 ### Notebook walkthrough
 
 [`run_with_trt_llm.ipynb`](./run_with_trt_llm.ipynb) is the full tutorial for the
-TensorRT-LLM backend: it walks through text-to-video and image-to-video requests
-against an already-running VisualGen server. Server launch options (Nano and
-Super, FP8 dynamic quantization, CFG parallelism, Ulysses, and parallel VAE)
-live in the [shared environment setup guide](../../README.md#tensorrt-llm).
+TensorRT-LLM backend: it walks through text-to-image, text-to-video, and
+image-to-video requests against an already-running VisualGen server. Server
+launch options (Nano and Super, FP8 dynamic quantization, CFG parallelism,
+Ulysses, and parallel VAE) live in the
+[shared environment setup guide](../../README.md#tensorrt-llm).
diff --git a/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb b/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
index 20b6f8e0..fb18bae8 100644
--- a/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
+++ b/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
@@ -17,7 +17,7 @@
     "\n",
     "This notebook calls already-running TensorRT-LLM VisualGen servers with direct `curl` requests from Python.\n",
     "\n",
-    "The examples are split into Cosmos3-Nano and Cosmos3-Super sections. Each section is self-contained, so you can run just one. The notebook covers TensorRT-LLM's stable server flow for text-to-video and image-to-video generation.\n"
+    "The examples are split into Cosmos3-Nano and Cosmos3-Super sections. Each section is self-contained, so you can run just one. The notebook covers TensorRT-LLM's stable server flow for text-to-image, text-to-video, and image-to-video generation.\n"
    ],
    "id": "title"
   },
@@ -27,17 +27,15 @@
    "source": [
     "## 1. Prerequisites\n",
     "\n",
-    "Use a running TensorRT-LLM server with Cosmos3 VisualGen support and set endpoint environment variables before the setup cell if you are not using the local default. Video generation uses `/v1/videos/generations`.\n",
+    "Use a running TensorRT-LLM server with Cosmos3 VisualGen support and set endpoint environment variables before the setup cell if you are not using the local default. Generation uses `/v1/videos/generations`; Cosmos3 text-to-image runs as a one-frame VisualGen video request.\n",
     "\n",
-    "Generator requires the Guardrail. Request access to the gated [nvidia/Cosmos-1.0-Guardrail](https://huggingface.co/nvidia/Cosmos-1.0-Guardrail) HF repository before running these examples. TensorRT-LLM loads guardrails by default; to disable them, set `TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` before starting the server. Newer TensorRT-LLM builds also accept per-request `extra_params`; set `COSMOS3_TRTLLM_INCLUDE_EXTRA_PARAMS=1` before running the setup cell only if your server schema supports that field.\n",
+    "Generator requires the Guardrail. Request access to the gated [nvidia/Cosmos-1.0-Guardrail](https://huggingface.co/nvidia/Cosmos-1.0-Guardrail) HF repository before running these examples. TensorRT-LLM loads guardrails by default; to disable them, set `TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` before starting the server or set `use_guardrails` to `False` in the request `extra_params`.\n",
     "\n",
     "```bash\n",
     "export COSMOS3_TRTLLM_BASE_URL=http://localhost:8000\n",
     "export COSMOS3_TRTLLM_NANO_BASE_URL=http://localhost:8000\n",
     "export COSMOS3_TRTLLM_SUPER_BASE_URL=http://localhost:8000\n",
     "export COSMOS3_TRTLLM_API_KEY=tensorrt_llm\n",
-    "# Optional for newer TensorRT-LLM builds that accept VideoGenerationRequest.extra_params:\n",
-    "# export COSMOS3_TRTLLM_INCLUDE_EXTRA_PARAMS=1\n",
     "```\n"
    ],
    "id": "prerequisites"
@@ -48,7 +46,7 @@
    "source": [
     "## 2. Start the Server\n",
     "\n",
-    "Run the TensorRT-LLM VisualGen server before running the request cells. The config YAMLs below come from TensorRT-LLM's Cosmos3 support.\n",
+    "Run the TensorRT-LLM VisualGen server before running the request cells. The config YAMLs below come from TensorRT-LLM's Cosmos3 support. Instead of installing TensorRT-LLM manually, you can use the latest release container: `nvcr.io/nvidia/tensorrt-llm/release:latest`.\n",
     "\n",
     "### Cosmos3-Nano\n",
     "\n",
@@ -70,7 +68,7 @@
     "torchrun --nproc_per_node=4 -m tensorrt_llm.commands.serve   nvidia/Cosmos3-Super   --visual_gen_args \"$TRTLLM_ROOT/examples/visual_gen/configs/cosmos3-super-4gpu.yaml\"   --port 8000\n",
     "```\n",
     "\n",
-    "TensorRT-LLM exposes `/health` when the server is ready. The notebook leaves the response format at the server default so it works with both older and newer TensorRT-LLM VisualGen schemas.\n"
+    "TensorRT-LLM exposes `/health` when the server is ready. This notebook sends Cosmos3 model-specific controls through `extra_params`, so use a TensorRT-LLM release that includes the Cosmos3 VisualGen API schema.\n"
    ],
    "id": "start-server"
   },
@@ -114,11 +112,9 @@
     "\n",
     "os.environ[\"COSMOS3_AUDIOVISUAL_OUTPUT_ROOT\"] = str(COSMOS3_AUDIOVISUAL_OUTPUT_ROOT)\n",
     "os.environ.setdefault(\"COSMOS3_TRTLLM_API_KEY\", \"tensorrt_llm\")\n",
-    "INCLUDE_TRTLLM_EXTRA_PARAMS = os.environ.get(\"COSMOS3_TRTLLM_INCLUDE_EXTRA_PARAMS\", \"0\") == \"1\"\n",
     "\n",
     "print(\"COSMOS_ROOT:\", COSMOS_ROOT)\n",
     "print(\"COSMOS3_AUDIOVISUAL_OUTPUT_ROOT:\", COSMOS3_AUDIOVISUAL_OUTPUT_ROOT)\n",
-    "print(\"COSMOS3_TRTLLM_INCLUDE_EXTRA_PARAMS:\", INCLUDE_TRTLLM_EXTRA_PARAMS)\n",
     "for model, endpoint in TRTLLM_ENDPOINTS.items():\n",
     "    print(f\"{model} endpoint: {endpoint}\")\n"
    ],
@@ -285,10 +281,14 @@
     "    \"use_guardrails\": True,\n",
     "}\n",
     "\n",
-    "# TensorRT-LLM Cosmos3 currently uses the stable video endpoint path here.\n",
-    "# Text-to-image endpoint behavior is intentionally left out until it is validated\n",
-    "# for the loaded TRT-LLM build.\n",
+    "# TensorRT-LLM Cosmos3 uses the stable video endpoint path here. Text-to-image\n",
+    "# generation is represented as a one-frame video request.\n",
     "ASSET_SETS = {\n",
+    "    \"t2i_nano\": {\n",
+    "        \"model\": \"Cosmos3-Nano\",\n",
+    "        \"mode\": \"text2image\",\n",
+    "        \"prompt\": \"assets/prompts/text2image/robot_draping.json\",\n",
+    "    },\n",
     "    \"t2v_nano\": {\n",
     "        \"model\": \"Cosmos3-Nano\",\n",
     "        \"mode\": \"text2video\",\n",
@@ -300,6 +300,11 @@
     "        \"prompt\": \"assets/prompts/image2video/car_driving.json\",\n",
     "        \"image\": \"assets/images/image2video/car_driving.jpg\",\n",
     "    },\n",
+    "    \"t2i_super\": {\n",
+    "        \"model\": \"Cosmos3-Super\",\n",
+    "        \"mode\": \"text2image\",\n",
+    "        \"prompt\": \"assets/prompts/text2image/robot_draping.json\",\n",
+    "    },\n",
     "    \"t2v_super\": {\n",
     "        \"model\": \"Cosmos3-Super\",\n",
     "        \"mode\": \"text2video\",\n",
@@ -348,16 +353,19 @@
     "    output_dir.mkdir(parents=True, exist_ok=True)\n",
     "\n",
     "    prompt_path = asset_path(spec[\"prompt\"])\n",
-    "    negative_prompt_path = asset_path(f\"assets/negative_prompts/{spec['mode']}/neg_prompt.json\")\n",
     "    payload_path = payload_dir / f\"{use_case}.json\"\n",
     "    payload = {\n",
     "        \"model_mode\": spec[\"mode\"],\n",
     "        \"name\": use_case,\n",
     "        \"prompt\": compact_json_file(prompt_path),\n",
-    "        \"negative_prompt\": compact_json_file(negative_prompt_path),\n",
     "        \"extra_params\": dict(TRTLLM_EXTRA_PARAMS),\n",
     "        **FIXED_SAMPLING,\n",
     "    }\n",
+    "    if spec[\"mode\"] == \"text2image\":\n",
+    "        payload[\"num_frames\"] = 1\n",
+    "    else:\n",
+    "        negative_prompt_path = asset_path(f\"assets/negative_prompts/{spec['mode']}/neg_prompt.json\")\n",
+    "        payload[\"negative_prompt\"] = compact_json_file(negative_prompt_path)\n",
     "    if spec[\"mode\"] == \"image2video\":\n",
     "        image_path = asset_path(spec[\"image\"])\n",
     "        payload[\"vision_path\"] = os.path.relpath(image_path, payload_path.parent)\n",
@@ -371,6 +379,8 @@
     "    print(f\"payload: {payload_path}\")\n",
     "    print(f\"output:  {output_dir}\")\n",
     "    print(f\"prompt:  {prompt_path.relative_to(COSMOS_ROOT)}\")\n",
+    "    if spec[\"mode\"] == \"text2image\":\n",
+    "        print(\"note:   TensorRT-LLM Cosmos3 text-to-image is served as a one-frame video response\")\n",
     "    if \"vision_path\" in payload:\n",
     "        image_display_path = resolve_payload_path(payload_path, payload[\"vision_path\"])\n",
     "        print(f\"image:   {image_display_path.relative_to(COSMOS_ROOT)}\")\n",
@@ -402,7 +412,6 @@
     "    height, width = payload_dimensions(payload)\n",
     "    body = {\n",
     "        \"prompt\": payload[\"prompt\"],\n",
-    "        \"negative_prompt\": payload[\"negative_prompt\"],\n",
     "        \"size\": f\"{width}x{height}\",\n",
     "        \"seconds\": payload[\"num_frames\"] / payload[\"fps\"],\n",
     "        \"fps\": payload[\"fps\"],\n",
@@ -410,8 +419,9 @@
     "        \"guidance_scale\": payload[\"guidance\"],\n",
     "        \"seed\": payload[\"seed\"],\n",
     "    }\n",
-    "    if INCLUDE_TRTLLM_EXTRA_PARAMS:\n",
-    "        body[\"extra_params\"] = payload[\"extra_params\"]\n",
+    "    if payload.get(\"negative_prompt\") is not None:\n",
+    "        body[\"negative_prompt\"] = payload[\"negative_prompt\"]\n",
+    "    body[\"extra_params\"] = payload[\"extra_params\"]\n",
     "    return body\n",
     "\n",
     "\n",
@@ -550,6 +560,63 @@
    ],
    "id": "nano-title"
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Nano: Text to Image\n",
+    "\n",
+    "Nano text-to-image generation using a structured JSON prompt. TensorRT-LLM Cosmos3 returns this as a one-frame video from the video generation endpoint.\n"
+   ],
+   "id": "nano-t2i"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "t2i_nano_payload, t2i_nano_output, t2i_nano_model = create_payload(\"t2i_nano\")\n"
+   ],
+   "id": "nano-t2i-payload"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Run\n"
+   ],
+   "id": "nano-t2i-run-title"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "check_trtllm_server(t2i_nano_model)\n",
+    "run_trtllm_payload(t2i_nano_payload, t2i_nano_output, model=t2i_nano_model)\n"
+   ],
+   "id": "nano-t2i-run"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### View Results\n"
+   ],
+   "id": "nano-t2i-view-title"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "view_run(t2i_nano_output)\n"
+   ],
+   "id": "nano-t2i-view"
+  },
   {
    "cell_type": "markdown",
    "metadata": {},
@@ -674,6 +741,63 @@
    ],
    "id": "super-title"
   },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Super: Text to Image\n",
+    "\n",
+    "Super text-to-image generation using the same structured JSON prompt. TensorRT-LLM Cosmos3 returns this as a one-frame video from the video generation endpoint.\n"
+   ],
+   "id": "super-t2i"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "t2i_super_payload, t2i_super_output, t2i_super_model = create_payload(\"t2i_super\")\n"
+   ],
+   "id": "super-t2i-payload"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Run\n"
+   ],
+   "id": "super-t2i-run-title"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "check_trtllm_server(t2i_super_model)\n",
+    "run_trtllm_payload(t2i_super_payload, t2i_super_output, model=t2i_super_model)\n"
+   ],
+   "id": "super-t2i-run"
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### View Results\n"
+   ],
+   "id": "super-t2i-view-title"
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "view_run(t2i_super_output)\n"
+   ],
+   "id": "super-t2i-view"
+  },
   {
    "cell_type": "markdown",
    "metadata": {},

From 0cc506b4ea7560be8d020dd6f3e4a0ab2800df30 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Tue, 23 Jun 2026 16:58:14 +0200
Subject: [PATCH 4/6] Fixed text-to-image request parameters

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 cookbooks/cosmos3/README.md                         |  9 +++++++--
 cookbooks/cosmos3/generator/audiovisual/README.md   |  6 +++---
 .../generator/audiovisual/run_with_trt_llm.ipynb    | 13 ++++++++-----
 3 files changed, 18 insertions(+), 10 deletions(-)

diff --git a/cookbooks/cosmos3/README.md b/cookbooks/cosmos3/README.md
index 3a367cae..9a01bcd8 100644
--- a/cookbooks/cosmos3/README.md
+++ b/cookbooks/cosmos3/README.md
@@ -214,8 +214,13 @@ The server exposes `/health`, `/v1/videos/generations`, `/v1/videos`, and
 `/v1/images/generations`. The audiovisual notebook uses the validated video
 generation endpoint for text-to-image, text-to-video, and image-to-video. Cosmos3
 text-to-image is sent as a one-frame video request, matching the TensorRT-LLM
-Cosmos3 pipeline. Requests send Cosmos3 controls through `extra_params`, so use a
-TensorRT-LLM build that includes the Cosmos3 VisualGen API schema.
+Cosmos3 pipeline; the notebook sends it as `seconds=1` and `fps=1` to satisfy
+the video request schema. Requests send Cosmos3 controls through `extra_params`,
+so use a TensorRT-LLM build that includes the Cosmos3 VisualGen API schema. The
+latest release container is available at
+`nvcr.io/nvidia/tensorrt-llm/release:latest`.
+The notebook sets request-level `max_sequence_length=2048` for longer structured
+JSON prompts.
 
 ## Transformers
 
diff --git a/cookbooks/cosmos3/generator/audiovisual/README.md b/cookbooks/cosmos3/generator/audiovisual/README.md
index ace73c5c..a063abfd 100644
--- a/cookbooks/cosmos3/generator/audiovisual/README.md
+++ b/cookbooks/cosmos3/generator/audiovisual/README.md
@@ -216,6 +216,7 @@ response = requests.post(
         "fps": 24,
         "num_inference_steps": 35,
         "guidance_scale": 6.0,
+        "max_sequence_length": 2048,
         "seed": 0,
         "extra_params": {
             "use_resolution_template": False,
@@ -234,9 +235,8 @@ For image-to-video, post multipart form data to the same endpoint with the
 reference image under `input_reference`. TensorRT-LLM Cosmos3 audio/action
 generation is not covered by this backend section.
 
-For text-to-image, use the same video generation endpoint with `seconds=1 / 24`
-and `fps=24`; TensorRT-LLM Cosmos3 returns a one-frame video response for this
-path.
+For text-to-image, use the same video generation endpoint with `seconds=1` and
+`fps=1`; TensorRT-LLM Cosmos3 returns a one-frame video response for this path.
 
 The TRT-LLM notebook always sends model-specific `extra_params`, so use a
 TensorRT-LLM release with the Cosmos3 VisualGen API schema. The latest release
diff --git a/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb b/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
index fb18bae8..6b7be68a 100644
--- a/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
+++ b/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
@@ -27,7 +27,7 @@
    "source": [
     "## 1. Prerequisites\n",
     "\n",
-    "Use a running TensorRT-LLM server with Cosmos3 VisualGen support and set endpoint environment variables before the setup cell if you are not using the local default. Generation uses `/v1/videos/generations`; Cosmos3 text-to-image runs as a one-frame VisualGen video request.\n",
+    "Use a running TensorRT-LLM server with Cosmos3 VisualGen support and set endpoint environment variables before the setup cell if you are not using the local default. Generation uses `/v1/videos/generations`; Cosmos3 text-to-image runs as a one-frame VisualGen video request with `seconds=1` and `fps=1`.\n",
     "\n",
     "Generator requires the Guardrail. Request access to the gated [nvidia/Cosmos-1.0-Guardrail](https://huggingface.co/nvidia/Cosmos-1.0-Guardrail) HF repository before running these examples. TensorRT-LLM loads guardrails by default; to disable them, set `TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` before starting the server or set `use_guardrails` to `False` in the request `extra_params`.\n",
     "\n",
@@ -68,7 +68,7 @@
     "torchrun --nproc_per_node=4 -m tensorrt_llm.commands.serve   nvidia/Cosmos3-Super   --visual_gen_args \"$TRTLLM_ROOT/examples/visual_gen/configs/cosmos3-super-4gpu.yaml\"   --port 8000\n",
     "```\n",
     "\n",
-    "TensorRT-LLM exposes `/health` when the server is ready. This notebook sends Cosmos3 model-specific controls through `extra_params`, so use a TensorRT-LLM release that includes the Cosmos3 VisualGen API schema.\n"
+    "TensorRT-LLM exposes `/health` when the server is ready. This notebook sends Cosmos3 model-specific controls through `extra_params`, so use a TensorRT-LLM release that includes the Cosmos3 VisualGen API schema. The latest release container is available at `nvcr.io/nvidia/tensorrt-llm/release:latest`.\n"
    ],
    "id": "start-server"
   },
@@ -269,6 +269,7 @@
     "    \"guidance\": 6.0,\n",
     "    \"fps\": 24,\n",
     "    \"num_frames\": 189,\n",
+    "    \"max_sequence_length\": 2048,\n",
     "    \"resolution\": \"720\",\n",
     "    \"aspect_ratio\": \"16,9\",\n",
     "    \"seed\": 0,\n",
@@ -363,6 +364,7 @@
     "    }\n",
     "    if spec[\"mode\"] == \"text2image\":\n",
     "        payload[\"num_frames\"] = 1\n",
+    "        payload[\"fps\"] = 1\n",
     "    else:\n",
     "        negative_prompt_path = asset_path(f\"assets/negative_prompts/{spec['mode']}/neg_prompt.json\")\n",
     "        payload[\"negative_prompt\"] = compact_json_file(negative_prompt_path)\n",
@@ -385,7 +387,7 @@
     "        image_display_path = resolve_payload_path(payload_path, payload[\"vision_path\"])\n",
     "        print(f\"image:   {image_display_path.relative_to(COSMOS_ROOT)}\")\n",
     "        display(Image(filename=str(image_display_path), width=420))\n",
-    "    print(json.dumps({k: payload[k] for k in [\"model_mode\", \"name\", \"num_steps\", \"guidance\", \"fps\", \"num_frames\", \"resolution\", \"aspect_ratio\", \"seed\", \"extra_params\"]}, indent=2))\n",
+    "    print(json.dumps({k: payload[k] for k in [\"model_mode\", \"name\", \"num_steps\", \"guidance\", \"fps\", \"num_frames\", \"max_sequence_length\", \"resolution\", \"aspect_ratio\", \"seed\", \"extra_params\"]}, indent=2))\n",
     "    return payload_path, output_dir, spec[\"model\"]\n",
     "\n",
     "\n",
@@ -417,6 +419,7 @@
     "        \"fps\": payload[\"fps\"],\n",
     "        \"num_inference_steps\": payload[\"num_steps\"],\n",
     "        \"guidance_scale\": payload[\"guidance\"],\n",
+    "        \"max_sequence_length\": payload[\"max_sequence_length\"],\n",
     "        \"seed\": payload[\"seed\"],\n",
     "    }\n",
     "    if payload.get(\"negative_prompt\") is not None:\n",
@@ -566,7 +569,7 @@
    "source": [
     "## Nano: Text to Image\n",
     "\n",
-    "Nano text-to-image generation using a structured JSON prompt. TensorRT-LLM Cosmos3 returns this as a one-frame video from the video generation endpoint.\n"
+    "Nano text-to-image generation using a structured JSON prompt. TensorRT-LLM Cosmos3 returns this as a one-frame video from the video generation endpoint; the request uses `seconds=1` and `fps=1`.\n"
    ],
    "id": "nano-t2i"
   },
@@ -747,7 +750,7 @@
    "source": [
     "## Super: Text to Image\n",
     "\n",
-    "Super text-to-image generation using the same structured JSON prompt. TensorRT-LLM Cosmos3 returns this as a one-frame video from the video generation endpoint.\n"
+    "Super text-to-image generation using the same structured JSON prompt. TensorRT-LLM Cosmos3 returns this as a one-frame video from the video generation endpoint; the request uses `seconds=1` and `fps=1`.\n"
    ],
    "id": "super-t2i"
   },

From 314f4c3c04bde75cb6b3936109e902d4f7108bd7 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Tue, 23 Jun 2026 17:36:57 +0200
Subject: [PATCH 5/6] Fixed t2i frame count and switched i2v example assets

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 cookbooks/cosmos3/README.md                   |  7 ++---
 .../cosmos3/generator/audiovisual/README.md   |  8 ++++--
 .../audiovisual/run_with_trt_llm.ipynb        | 28 ++++++++++---------
 3 files changed, 23 insertions(+), 20 deletions(-)

diff --git a/cookbooks/cosmos3/README.md b/cookbooks/cosmos3/README.md
index 9a01bcd8..bc952423 100644
--- a/cookbooks/cosmos3/README.md
+++ b/cookbooks/cosmos3/README.md
@@ -214,13 +214,12 @@ The server exposes `/health`, `/v1/videos/generations`, `/v1/videos`, and
 `/v1/images/generations`. The audiovisual notebook uses the validated video
 generation endpoint for text-to-image, text-to-video, and image-to-video. Cosmos3
 text-to-image is sent as a one-frame video request, matching the TensorRT-LLM
-Cosmos3 pipeline; the notebook sends it as `seconds=1` and `fps=1` to satisfy
-the video request schema. Requests send Cosmos3 controls through `extra_params`,
+Cosmos3 pipeline; the notebook sends it as `num_frames=1`, `seconds=1`, and
+`fps=8` to satisfy the video request schema while preserving a single generated
+frame. Requests send Cosmos3 controls through `extra_params`,
 so use a TensorRT-LLM build that includes the Cosmos3 VisualGen API schema. The
 latest release container is available at
 `nvcr.io/nvidia/tensorrt-llm/release:latest`.
-The notebook sets request-level `max_sequence_length=2048` for longer structured
-JSON prompts.
 
 ## Transformers
 
diff --git a/cookbooks/cosmos3/generator/audiovisual/README.md b/cookbooks/cosmos3/generator/audiovisual/README.md
index a063abfd..197076fe 100644
--- a/cookbooks/cosmos3/generator/audiovisual/README.md
+++ b/cookbooks/cosmos3/generator/audiovisual/README.md
@@ -214,9 +214,9 @@ response = requests.post(
         "size": "1280x720",
         "seconds": 189 / 24,
         "fps": 24,
+        "num_frames": 189,
         "num_inference_steps": 35,
         "guidance_scale": 6.0,
-        "max_sequence_length": 2048,
         "seed": 0,
         "extra_params": {
             "use_resolution_template": False,
@@ -235,8 +235,10 @@ For image-to-video, post multipart form data to the same endpoint with the
 reference image under `input_reference`. TensorRT-LLM Cosmos3 audio/action
 generation is not covered by this backend section.
 
-For text-to-image, use the same video generation endpoint with `seconds=1` and
-`fps=1`; TensorRT-LLM Cosmos3 returns a one-frame video response for this path.
+For text-to-image, use the same video generation endpoint with `num_frames=1`,
+`seconds=1`, and `fps=8`; TensorRT-LLM Cosmos3 returns a one-frame video
+response for this path. `num_frames` is passed explicitly so the server does not
+derive an eight-frame clip from `seconds * fps`.
 
 The TRT-LLM notebook always sends model-specific `extra_params`, so use a
 TensorRT-LLM release with the Cosmos3 VisualGen API schema. The latest release
diff --git a/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb b/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
index 6b7be68a..7b4e2286 100644
--- a/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
+++ b/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
@@ -27,7 +27,7 @@
    "source": [
     "## 1. Prerequisites\n",
     "\n",
-    "Use a running TensorRT-LLM server with Cosmos3 VisualGen support and set endpoint environment variables before the setup cell if you are not using the local default. Generation uses `/v1/videos/generations`; Cosmos3 text-to-image runs as a one-frame VisualGen video request with `seconds=1` and `fps=1`.\n",
+    "Use a running TensorRT-LLM server with Cosmos3 VisualGen support and set endpoint environment variables before the setup cell if you are not using the local default. Generation uses `/v1/videos/generations`; Cosmos3 text-to-image runs as a one-frame VisualGen video request with `num_frames=1`, `seconds=1`, and `fps=8`.\n",
     "\n",
     "Generator requires the Guardrail. Request access to the gated [nvidia/Cosmos-1.0-Guardrail](https://huggingface.co/nvidia/Cosmos-1.0-Guardrail) HF repository before running these examples. TensorRT-LLM loads guardrails by default; to disable them, set `TRTLLM_DISABLE_COSMOS3_GUARDRAILS=1` before starting the server or set `use_guardrails` to `False` in the request `extra_params`.\n",
     "\n",
@@ -269,7 +269,6 @@
     "    \"guidance\": 6.0,\n",
     "    \"fps\": 24,\n",
     "    \"num_frames\": 189,\n",
-    "    \"max_sequence_length\": 2048,\n",
     "    \"resolution\": \"720\",\n",
     "    \"aspect_ratio\": \"16,9\",\n",
     "    \"seed\": 0,\n",
@@ -281,7 +280,6 @@
     "    \"use_system_prompt\": False,\n",
     "    \"use_guardrails\": True,\n",
     "}\n",
-    "\n",
     "# TensorRT-LLM Cosmos3 uses the stable video endpoint path here. Text-to-image\n",
     "# generation is represented as a one-frame video request.\n",
     "ASSET_SETS = {\n",
@@ -298,8 +296,8 @@
     "    \"i2v_nano\": {\n",
     "        \"model\": \"Cosmos3-Nano\",\n",
     "        \"mode\": \"image2video\",\n",
-    "        \"prompt\": \"assets/prompts/image2video/car_driving.json\",\n",
-    "        \"image\": \"assets/images/image2video/car_driving.jpg\",\n",
+    "        \"prompt\": \"assets/prompts/image2video/humanoid_robot.json\",\n",
+    "        \"image\": \"assets/images/image2video/humanoid_robot.jpg\",\n",
     "    },\n",
     "    \"t2i_super\": {\n",
     "        \"model\": \"Cosmos3-Super\",\n",
@@ -314,8 +312,8 @@
     "    \"i2v_super\": {\n",
     "        \"model\": \"Cosmos3-Super\",\n",
     "        \"mode\": \"image2video\",\n",
-    "        \"prompt\": \"assets/prompts/image2video/car_driving.json\",\n",
-    "        \"image\": \"assets/images/image2video/car_driving.jpg\",\n",
+    "        \"prompt\": \"assets/prompts/image2video/humanoid_robot.json\",\n",
+    "        \"image\": \"assets/images/image2video/humanoid_robot.jpg\",\n",
     "    },\n",
     "}\n",
     "\n",
@@ -364,7 +362,8 @@
     "    }\n",
     "    if spec[\"mode\"] == \"text2image\":\n",
     "        payload[\"num_frames\"] = 1\n",
-    "        payload[\"fps\"] = 1\n",
+    "        payload[\"fps\"] = 8\n",
+    "        payload[\"seconds\"] = 1\n",
     "    else:\n",
     "        negative_prompt_path = asset_path(f\"assets/negative_prompts/{spec['mode']}/neg_prompt.json\")\n",
     "        payload[\"negative_prompt\"] = compact_json_file(negative_prompt_path)\n",
@@ -387,7 +386,10 @@
     "        image_display_path = resolve_payload_path(payload_path, payload[\"vision_path\"])\n",
     "        print(f\"image:   {image_display_path.relative_to(COSMOS_ROOT)}\")\n",
     "        display(Image(filename=str(image_display_path), width=420))\n",
-    "    print(json.dumps({k: payload[k] for k in [\"model_mode\", \"name\", \"num_steps\", \"guidance\", \"fps\", \"num_frames\", \"max_sequence_length\", \"resolution\", \"aspect_ratio\", \"seed\", \"extra_params\"]}, indent=2))\n",
+    "    preview_keys = [\"model_mode\", \"name\", \"num_steps\", \"guidance\", \"fps\", \"num_frames\", \"resolution\", \"aspect_ratio\", \"seed\", \"extra_params\"]\n",
+    "    if \"seconds\" in payload:\n",
+    "        preview_keys.insert(5, \"seconds\")\n",
+    "    print(json.dumps({k: payload[k] for k in preview_keys}, indent=2))\n",
     "    return payload_path, output_dir, spec[\"model\"]\n",
     "\n",
     "\n",
@@ -415,11 +417,11 @@
     "    body = {\n",
     "        \"prompt\": payload[\"prompt\"],\n",
     "        \"size\": f\"{width}x{height}\",\n",
-    "        \"seconds\": payload[\"num_frames\"] / payload[\"fps\"],\n",
+    "        \"seconds\": payload.get(\"seconds\", payload[\"num_frames\"] / payload[\"fps\"]),\n",
     "        \"fps\": payload[\"fps\"],\n",
+    "        \"num_frames\": payload[\"num_frames\"],\n",
     "        \"num_inference_steps\": payload[\"num_steps\"],\n",
     "        \"guidance_scale\": payload[\"guidance\"],\n",
-    "        \"max_sequence_length\": payload[\"max_sequence_length\"],\n",
     "        \"seed\": payload[\"seed\"],\n",
     "    }\n",
     "    if payload.get(\"negative_prompt\") is not None:\n",
@@ -569,7 +571,7 @@
    "source": [
     "## Nano: Text to Image\n",
     "\n",
-    "Nano text-to-image generation using a structured JSON prompt. TensorRT-LLM Cosmos3 returns this as a one-frame video from the video generation endpoint; the request uses `seconds=1` and `fps=1`.\n"
+    "Nano text-to-image generation using a structured JSON prompt. TensorRT-LLM Cosmos3 returns this as a one-frame video from the video generation endpoint; the request uses `num_frames=1`, `seconds=1`, and `fps=8`.\n"
    ],
    "id": "nano-t2i"
   },
@@ -750,7 +752,7 @@
    "source": [
     "## Super: Text to Image\n",
     "\n",
-    "Super text-to-image generation using the same structured JSON prompt. TensorRT-LLM Cosmos3 returns this as a one-frame video from the video generation endpoint; the request uses `seconds=1` and `fps=1`.\n"
+    "Super text-to-image generation using the same structured JSON prompt. TensorRT-LLM Cosmos3 returns this as a one-frame video from the video generation endpoint; the request uses `num_frames=1`, `seconds=1`, and `fps=8`.\n"
    ],
    "id": "super-t2i"
   },

From 72693a368685e02e8f72dcdc4f6e52a4f235be59 Mon Sep 17 00:00:00 2001
From: Maciej Bala <mbala@nvidia.com>
Date: Wed, 24 Jun 2026 09:41:36 +0200
Subject: [PATCH 6/6] Updated install instructions

Signed-off-by: Maciej Bala <mbala@nvidia.com>
---
 cookbooks/cosmos3/README.md                   | 41 ++++++++++++++++---
 .../cosmos3/generator/audiovisual/README.md   |  5 ++-
 .../audiovisual/run_with_trt_llm.ipynb        |  8 ++--
 3 files changed, 43 insertions(+), 11 deletions(-)

diff --git a/cookbooks/cosmos3/README.md b/cookbooks/cosmos3/README.md
index bc952423..7aea48cf 100644
--- a/cookbooks/cosmos3/README.md
+++ b/cookbooks/cosmos3/README.md
@@ -170,13 +170,42 @@ text-to-video, and image-to-video examples. Cosmos3 support was added in TensorR
 [#14824](https://github.com/NVIDIA/TensorRT-LLM/pull/14824); use a
 TensorRT-LLM checkout or package that includes that change.
 
-Install TensorRT-LLM following its upstream documentation, or run the latest
-release container instead of installing it manually:
+Install TensorRT-LLM following its upstream documentation.
+
+To build TensorRT-LLM from source, follow NVIDIA's
+[Build from Source](https://nvidia.github.io/TensorRT-LLM/installation/build-from-source.html)
+guide. This is the right path when you need a checkout that contains a recent
+Cosmos3 VisualGen change before it is available in your installed package or
+release image.
 
 ```bash
-docker pull nvcr.io/nvidia/tensorrt-llm/release:latest
+apt-get update && apt-get -y install git git-lfs
+git lfs install
+
+git clone https://github.com/NVIDIA/TensorRT-LLM.git
+cd TensorRT-LLM
+git submodule update --init --recursive
+git lfs pull
+
+# Pick a devel tag from the upstream build-from-source guide or NGC.
+docker pull nvcr.io/nvidia/tensorrt-llm/devel:<tag>
+docker run --rm -it \
+  --ipc=host \
+  --ulimit memlock=-1 --ulimit stack=67108864 \
+  --gpus=all \
+  --volume "$PWD":"$PWD" \
+  --workdir "$PWD" \
+  nvcr.io/nvidia/tensorrt-llm/devel:<tag>
+
+# Inside the container:
+python3 scripts/build_wheel.py --use_ccache --skip_building_wheel --linking_install_binary
+pip install -e .
 ```
 
+For Python-only changes, the upstream guide also documents
+`TRTLLM_USE_PRECOMPILED=1 pip install -e .` to reuse precompiled binaries while
+installing the checkout in editable mode.
+
 Then install the Cosmos3 guardrail package in the same environment unless you
 explicitly disable guardrails before starting the server:
 
@@ -217,9 +246,9 @@ text-to-image is sent as a one-frame video request, matching the TensorRT-LLM
 Cosmos3 pipeline; the notebook sends it as `num_frames=1`, `seconds=1`, and
 `fps=8` to satisfy the video request schema while preserving a single generated
 frame. Requests send Cosmos3 controls through `extra_params`,
-so use a TensorRT-LLM build that includes the Cosmos3 VisualGen API schema. The
-latest release container is available at
-`nvcr.io/nvidia/tensorrt-llm/release:latest`.
+so use a TensorRT-LLM build that includes the Cosmos3 VisualGen API schema.
+The notebook sets request-level `max_sequence_length=2048` for longer structured
+JSON prompts.
 
 ## Transformers
 
diff --git a/cookbooks/cosmos3/generator/audiovisual/README.md b/cookbooks/cosmos3/generator/audiovisual/README.md
index 197076fe..b74ea26c 100644
--- a/cookbooks/cosmos3/generator/audiovisual/README.md
+++ b/cookbooks/cosmos3/generator/audiovisual/README.md
@@ -217,6 +217,7 @@ response = requests.post(
         "num_frames": 189,
         "num_inference_steps": 35,
         "guidance_scale": 6.0,
+        "max_sequence_length": 2048,
         "seed": 0,
         "extra_params": {
             "use_resolution_template": False,
@@ -241,8 +242,8 @@ response for this path. `num_frames` is passed explicitly so the server does not
 derive an eight-frame clip from `seconds * fps`.
 
 The TRT-LLM notebook always sends model-specific `extra_params`, so use a
-TensorRT-LLM release with the Cosmos3 VisualGen API schema. The latest release
-container is available at `nvcr.io/nvidia/tensorrt-llm/release:latest`.
+TensorRT-LLM build with the Cosmos3 VisualGen API schema. The notebook sets
+request-level `max_sequence_length=2048` for longer structured JSON prompts.
 
 ### Notebook walkthrough
 
diff --git a/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb b/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
index 7b4e2286..df6260e7 100644
--- a/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
+++ b/cookbooks/cosmos3/generator/audiovisual/run_with_trt_llm.ipynb
@@ -46,7 +46,7 @@
    "source": [
     "## 2. Start the Server\n",
     "\n",
-    "Run the TensorRT-LLM VisualGen server before running the request cells. The config YAMLs below come from TensorRT-LLM's Cosmos3 support. Instead of installing TensorRT-LLM manually, you can use the latest release container: `nvcr.io/nvidia/tensorrt-llm/release:latest`.\n",
+    "Run the TensorRT-LLM VisualGen server before running the request cells. The config YAMLs below come from TensorRT-LLM's Cosmos3 support. To build a checkout from source, follow NVIDIA's [Build from Source](https://nvidia.github.io/TensorRT-LLM/installation/build-from-source.html) guide and then run the commands below from that checkout.\n",
     "\n",
     "### Cosmos3-Nano\n",
     "\n",
@@ -68,7 +68,7 @@
     "torchrun --nproc_per_node=4 -m tensorrt_llm.commands.serve   nvidia/Cosmos3-Super   --visual_gen_args \"$TRTLLM_ROOT/examples/visual_gen/configs/cosmos3-super-4gpu.yaml\"   --port 8000\n",
     "```\n",
     "\n",
-    "TensorRT-LLM exposes `/health` when the server is ready. This notebook sends Cosmos3 model-specific controls through `extra_params`, so use a TensorRT-LLM release that includes the Cosmos3 VisualGen API schema. The latest release container is available at `nvcr.io/nvidia/tensorrt-llm/release:latest`.\n"
+    "TensorRT-LLM exposes `/health` when the server is ready. This notebook sends Cosmos3 model-specific controls through `extra_params`, so use a TensorRT-LLM build that includes the Cosmos3 VisualGen API schema.\n"
    ],
    "id": "start-server"
   },
@@ -269,6 +269,7 @@
     "    \"guidance\": 6.0,\n",
     "    \"fps\": 24,\n",
     "    \"num_frames\": 189,\n",
+    "    \"max_sequence_length\": 2048,\n",
     "    \"resolution\": \"720\",\n",
     "    \"aspect_ratio\": \"16,9\",\n",
     "    \"seed\": 0,\n",
@@ -386,7 +387,7 @@
     "        image_display_path = resolve_payload_path(payload_path, payload[\"vision_path\"])\n",
     "        print(f\"image:   {image_display_path.relative_to(COSMOS_ROOT)}\")\n",
     "        display(Image(filename=str(image_display_path), width=420))\n",
-    "    preview_keys = [\"model_mode\", \"name\", \"num_steps\", \"guidance\", \"fps\", \"num_frames\", \"resolution\", \"aspect_ratio\", \"seed\", \"extra_params\"]\n",
+    "    preview_keys = [\"model_mode\", \"name\", \"num_steps\", \"guidance\", \"fps\", \"num_frames\", \"max_sequence_length\", \"resolution\", \"aspect_ratio\", \"seed\", \"extra_params\"]\n",
     "    if \"seconds\" in payload:\n",
     "        preview_keys.insert(5, \"seconds\")\n",
     "    print(json.dumps({k: payload[k] for k in preview_keys}, indent=2))\n",
@@ -422,6 +423,7 @@
     "        \"num_frames\": payload[\"num_frames\"],\n",
     "        \"num_inference_steps\": payload[\"num_steps\"],\n",
     "        \"guidance_scale\": payload[\"guidance\"],\n",
+    "        \"max_sequence_length\": payload[\"max_sequence_length\"],\n",
     "        \"seed\": payload[\"seed\"],\n",
     "    }\n",
     "    if payload.get(\"negative_prompt\") is not None:\n",