diff --git a/.github/workflows/gpu-tests.yml b/.github/workflows/gpu-tests.yml index 3a92614..72ff4f4 100644 --- a/.github/workflows/gpu-tests.yml +++ b/.github/workflows/gpu-tests.yml @@ -3,14 +3,15 @@ # Cosmos3-Nano GPU test suite on a self-hosted 8×H200 runner. # -# A single ``pre-commit`` (lint) job runs first; the four GPU jobs all +# A single ``pre-commit`` (lint) job runs first; the five GPU jobs all # ``needs:`` it, so they wait on ONE pre-commit run and are skipped if lint # fails — the single self-hosted runner is never spent on a lint-failing commit. -# The four GPU jobs then run (one at a time on the single runner): -# * training-smoke — Nano SFT pipeline (convert -> train 5 -> export -> t2i) -# * generator-regression — vision_sft_nano loss vs goldens (4-GPU subset) -# * inference-smoke — Nano multi-modality inference (t2vs + policy + forward_dynamics) -# * reasoner-regression — llava_ov loss vs goldens (4-GPU subset) +# The five GPU jobs then run (one at a time on the single runner): +# * training-smoke — Nano SFT pipeline (convert -> train 5 -> export -> t2i) +# * generator-training-regression — vision_sft_nano loss vs goldens (4-GPU subset) +# * generator-inference-smoke — Nano multi-modality inference (t2vs + policy + forward_dynamics) +# * reasoner-inference-smoke — Nano reasoner inference first-token logits vs golden (image-conditioned, 4-GPU) +# * reasoner-training-regression — llava_ov loss vs goldens (4-GPU subset) # # Requires: # * a self-hosted runner labelled [self-hosted, gpu, h200] with 8 GPUs, @@ -72,7 +73,7 @@ jobs: rm -rf examples/checkpoints || true rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true - generator-regression: + generator-training-regression: needs: pre-commit runs-on: [self-hosted, gpu, h200] timeout-minutes: 60 @@ -104,7 +105,7 @@ jobs: run: | rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true - inference-smoke: + generator-inference-smoke: needs: pre-commit runs-on: [self-hosted, gpu, h200] 
timeout-minutes: 60 @@ -136,7 +137,46 @@ jobs: run: | rm -rf "${TMPDIR:-/tmp}"/pytest-of-* /tmp/pytest-of-* || true - reasoner-regression: + reasoner-inference-smoke: + needs: pre-commit + runs-on: [self-hosted, gpu, h200] + timeout-minutes: 60 + env: + HF_TOKEN: ${{ secrets.HF_TOKEN }} + HF_HUB_DISABLE_XET: "1" + # 4-GPU reasoner inference test (uses 4 of the 8 GPUs); the gpus(4) marker + # requires MAX_GPUS == 4. + TEST_MAX_GPUS: "4" + steps: + - uses: actions/checkout@v6 + + - uses: astral-sh/setup-uv@v7 + + - name: Sync environment (cu128-train) + run: uv sync --all-extras --group=cu128-train + + # Image-conditioned Cosmos3-Nano reasoner inference: compare the first + # decoded token's logits against a committed golden (exact argmax + + # allclose rtol/atol=1e-3). -s streams the live process log. The text-only + # variant lives in the same file but is not exercised in CI. Cache the + # downloaded image asset (reasoner_image.json's remote vision_path) in the + # persistent input-asset dir shared with the generator-inference-smoke job. + - name: Nano reasoner inference (image-conditioned first-token logits golden, 4 GPU) + run: | + export LD_LIBRARY_PATH= + export COSMOS_DOWNLOAD_CACHE_DIR="$RUNNER_WORKSPACE/cosmos_input_cache" + uv run --all-extras --group=cu128-train python -m pytest -v -s \ + tests/nano_reasoner_inference_smoke_test.py::test_nano_reasoner_image_first_token_logits \ + --num-gpus=4 --levels=2 -o addopts= + + # Reasoner inference writes only the pytest tmp dir (generated text + logs); + # the checkpoint download stays in the HF cache (kept). 
def _install_determinism() -> None:
    """Pin deterministic kernel selection for the current process.

    Sets ``CUBLAS_WORKSPACE_CONFIG`` and ``FLASH_ATTENTION_DETERMINISTIC``
    (values already present in the environment are left alone), then asks
    torch for deterministic algorithms and turns off cuDNN autotuning.
    """
    # Must be set before the first cuBLAS call for deterministic GEMMs.
    os.environ.setdefault("CUBLAS_WORKSPACE_CONFIG", ":4096:8")
    # Make flash-attention kernels deterministic (no atomic-add reductions).
    os.environ.setdefault("FLASH_ATTENTION_DETERMINISTIC", "1")

    import torch

    # warn_only: degrade (don't crash) on any op lacking a deterministic impl.
    torch.use_deterministic_algorithms(True, warn_only=True)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False


def _ensure_parent_dir(path: str) -> None:
    """Create the directory that will contain ``path``, if ``path`` names one."""
    parent = os.path.dirname(path)
    # A bare file name has an empty dirname, and os.makedirs("") raises
    # FileNotFoundError -- only create a parent that is actually spelled out.
    if parent:
        os.makedirs(parent, exist_ok=True)


def _install_logits_probe(dump_path: str) -> None:
    """Wrap ``unified_mot._sample_next_token`` so its first call dumps logits.

    The wrapper replaces the module-global function in place.  On its first
    invocation in this process it saves ``logits[0]`` (detached, fp32, CPU) to
    ``dump_path`` -- but only when the ``RANK`` environment variable is ``0``
    or unset.  Every invocation, including the first, is then forwarded to the
    original function with its arguments untouched.

    Args:
        dump_path: File the first-token logits are written to; a missing
            parent directory is created.
    """
    import torch

    import cosmos_framework.model.vfm.mot.unified_mot as unified_mot

    original = unified_mot._sample_next_token
    saved = False

    def _patched(logits, *args, **kwargs):
        # ``logits`` is [B, vocab] for the token being sampled. The first call
        # in a generation run is the first decoded token after the prompt.
        nonlocal saved
        if not saved:
            # Flip the flag on every rank so later calls skip this branch.
            saved = True
            if int(os.environ.get("RANK", "0")) == 0:
                _ensure_parent_dir(dump_path)
                # Sample 0, full vocab, fp32 on CPU — stable to torch.load anywhere.
                torch.save(logits[0].detach().float().cpu(), dump_path)
        return original(logits, *args, **kwargs)

    unified_mot._sample_next_token = _patched


def main() -> None:
    """Install determinism and the logits probe, then run the inference CLI.

    Raises:
        KeyError: ``REASONER_LOGITS_DUMP`` is unset or empty.
    """
    # Validate the dump path before importing torch or loading a model, so a
    # misconfigured launch fails immediately instead of at the first token.
    dump_path = os.environ.get("REASONER_LOGITS_DUMP")
    if not dump_path:
        raise KeyError("REASONER_LOGITS_DUMP must be set to a non-empty output path")

    _install_determinism()
    _install_logits_probe(dump_path)

    from cosmos_framework.scripts.inference import main as inference_main

    inference_main()


if __name__ == "__main__":
    main()
+ +Each asserts a non-empty ``reasoner_text`` was produced AND compares the +captured first-token logits against its own committed golden tensor: exact +argmax match + ``torch.allclose(rtol=1e-3, atol=1e-3)``. Determinism is pinned +in the probe (greedy decode, deterministic cuBLAS/cuDNN/flash-attn, fixed seed), +so a clean run reproduces the golden run-to-run on the same 4-GPU config. + +Goldens (one per case):: + + tests/data/nano_reasoner_inference_smoke_test/first_token_logits_golden.pt + tests/data/nano_reasoner_inference_smoke_test/first_token_logits_image_golden.pt + +Golden bootstrap: on the first run a golden does not exist; the test writes the +captured tensor next to the golden path (``*_golden`` suffix dropped) and skips +with instructions to rename it to the golden name and commit. Subsequent runs +compare against the committed golden. + +Invocation (inside the inference container, from the repo root, on a >=4-GPU +node):: + + TEST_MAX_GPUS=4 pytest -s tests/nano_reasoner_inference_smoke_test.py \ + --num-gpus=4 --levels=2 -o addopts= + +Without ``--num-gpus``/``--levels`` (e.g. the no-GPU pre-commit CI) the test is +not collected. +""" + +import json +import os +import shutil +import socket +import subprocess +import sys +from pathlib import Path + +import pytest + +from cosmos_framework.inference.fixtures.args import MAX_GPUS + +REPO_ROOT = Path(__file__).resolve().parents[1] + +# Goldens live under the repo's tests/data/ convention, one per case. +_GOLDEN_DIR = REPO_ROOT / "tests" / "data" / "nano_reasoner_inference_smoke_test" +_TEXT_GOLDEN = _GOLDEN_DIR / "first_token_logits_golden.pt" +_IMAGE_GOLDEN = _GOLDEN_DIR / "first_token_logits_image_golden.pt" + +# Tight tolerance — the probe pins deterministic kernels + greedy decode. 
_RTOL = 1e-3
_ATOL = 1e-3


def _free_port() -> int:
    """Return a TCP port that was free when probed, for torchrun's rendezvous.

    Binding to port 0 lets the OS pick an unused port; the socket is closed
    again before returning, so the port is only known to have been free at
    the time of the call.
    """
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        s.bind(("", 0))
        return s.getsockname()[1]


def _run(cmd: list[str], log_file: Path, extra_env: dict[str, str] | None = None) -> str:
    """Run ``cmd`` from the repo root and return its combined stdout/stderr.

    Output is streamed line by line to this process's stdout (live under
    ``pytest -s``) and mirrored into ``log_file``.

    Args:
        cmd: Command and arguments, passed to ``subprocess.Popen`` as a list.
        log_file: Destination for the combined output; parents are created.
        extra_env: Variables layered over the inherited environment, applied
            after ``PYTHONPATH`` has been extended with ``.``.

    Returns:
        The complete combined output of the child process.

    Fails the test through ``pytest.fail`` -- with the last 3000 characters
    of output -- when the child exits with a non-zero status.
    """
    env = os.environ.copy()
    # Prepend "." (the child's cwd is the repo root) using the platform
    # separator, without leaving an empty entry when PYTHONPATH was unset.
    env["PYTHONPATH"] = os.pathsep.join(p for p in (".", env.get("PYTHONPATH", "")) if p)
    if extra_env:
        env.update(extra_env)
    log_file.parent.mkdir(parents=True, exist_ok=True)
    captured: list[str] = []
    # errors="replace": a stray undecodable byte in the child's output must
    # not abort the run with a UnicodeError unrelated to what is under test.
    with log_file.open("w", errors="replace") as fp:
        with subprocess.Popen(
            cmd,
            env=env,
            cwd=str(REPO_ROOT),
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            errors="replace",
            bufsize=1,
        ) as proc:
            assert proc.stdout is not None
            try:
                for line in proc.stdout:
                    sys.stdout.write(line)
                    sys.stdout.flush()
                    fp.write(line)
                    captured.append(line)
            except BaseException:
                # Streaming was interrupted (Ctrl-C, write error, ...): do not
                # leave the launcher running. SIGTERM first so it gets a
                # chance to shut down cleanly, SIGKILL if it does not exit.
                proc.terminate()
                try:
                    proc.wait(timeout=30)
                except subprocess.TimeoutExpired:
                    proc.kill()
                raise
    # Popen's context manager has waited for the child by this point.
    returncode = proc.returncode
    text = "".join(captured)
    if returncode != 0:
        pytest.fail(f"inference failed with exit code {returncode}:\n {' '.join(cmd)}\nLog tail:\n{text[-3000:]}")
    return text


def _reasoner_text(out_dir: Path) -> str:
    """Return the single sample's ``reasoner_text`` from ``sample_outputs.json``.

    Searches ``out_dir`` recursively and asserts that exactly one
    ``sample_outputs.json`` exists and that ``outputs[0].content`` is a dict
    holding a non-blank ``reasoner_text`` string.
    """
    results = sorted(out_dir.rglob("sample_outputs.json"))
    assert len(results) == 1, f"expected one sample_outputs.json, found {[str(p) for p in results]}"
    content = json.loads(results[0].read_text())["outputs"][0]["content"]
    text = content.get("reasoner_text") if isinstance(content, dict) else None
    assert isinstance(text, str) and text.strip(), f"empty/missing reasoner_text in {results[0]}: {content!r}"
    return text


def _run_reasoner_probe(tmp_path: Path, input_json: str) -> Path:
    """Run a 4-process reasoner inference through the logits probe.

    Args:
        tmp_path: Scratch directory; receives ``out/``, ``inference.log`` and
            ``first_token_logits.pt``.
        input_json: Inference input file, relative to the repo root.

    Returns:
        Path of the first-token logits file written by the probe.

    Asserts that a non-empty ``reasoner_text`` was produced and that the
    probe actually wrote the logits file.
    """
    out_dir = tmp_path / "out"
    dump = tmp_path / "first_token_logits.pt"
    cmd = [
        "torchrun",
        "--nproc_per_node=4",
        f"--master_port={_free_port()}",
        "tests/_reasoner_logits_probe.py",
        "--parallelism-preset=throughput",
        "-i",
        input_json,
        "-o",
        str(out_dir),
        "--checkpoint-path",
        "Cosmos3-Nano",
        "--seed=0",
    ]
    _run(cmd, tmp_path / "inference.log", extra_env={"REASONER_LOGITS_DUMP": str(dump)})
    _reasoner_text(out_dir)
    assert dump.is_file(), f"probe did not write first-token logits to {dump}"
    return dump


def _assert_matches_golden(dump: Path, golden_path: Path) -> None:
    """Compare captured logits to ``golden_path``: exact argmax + tight allclose.

    On the first run (no golden) the captured tensor is copied next to the
    golden path (``_golden`` removed from the file name) and the test is
    skipped with rename instructions.

    Args:
        dump: Logits file produced by the current run.
        golden_path: Committed reference tensor to compare against.
    """
    import torch

    # Both files hold a plain tensor, so restrict unpickling to tensor data
    # and load onto CPU regardless of where the file was produced.
    current = torch.load(dump, map_location="cpu", weights_only=True)
    if not golden_path.is_file():
        golden_path.parent.mkdir(parents=True, exist_ok=True)
        candidate = golden_path.with_name(golden_path.name.replace("_golden", ""))
        shutil.copyfile(dump, candidate)
        pytest.skip(f"golden created at {candidate}; rename to {golden_path.name} and commit, then re-run")

    ref = torch.load(golden_path, map_location="cpu", weights_only=True)
    assert current.shape == ref.shape, f"logits shape {tuple(current.shape)} != golden {tuple(ref.shape)}"
    # Hard gate: the greedily-predicted first token must match exactly.
    assert int(current.argmax()) == int(ref.argmax()), (
        f"first-token argmax {int(current.argmax())} != golden {int(ref.argmax())}"
    )
    # Sensitive gate: full logits within tight tolerance.
    assert torch.allclose(current, ref, rtol=_RTOL, atol=_ATOL), (
        f"first-token logits differ from golden beyond rtol={_RTOL}, atol={_ATOL}; "
        f"max|Δ|={float((current - ref).abs().max()):.3e}"
    )


@pytest.fixture(scope="module", autouse=True)
def _require_4_gpus() -> None:
    """Skip the module unless a 4-GPU torchrun launch is possible here."""
    if shutil.which("torchrun") is None:
        pytest.skip("torchrun not on PATH -- must run inside the inference container")
    try:
        import torch
    except Exception as exc:  # pragma: no cover -- surfaces during dev only
        pytest.skip(f"torch unavailable ({exc!r})")
    if not torch.cuda.is_available() or torch.cuda.device_count() < 4:
        pytest.skip(f"requires 4 visible CUDA devices, found {torch.cuda.device_count()}")


# Defined only when the active MAX_GPUS is 4 -- the conftest rejects ``gpus(N)``
# markers outside ``ALL_NUM_GPUS = (0, 1, MAX_GPUS)``. Run with TEST_MAX_GPUS=4.
if MAX_GPUS == 4:

    @pytest.mark.level(2)
    @pytest.mark.gpus(4)
    def test_nano_reasoner_first_token_logits(tmp_path: Path) -> None:
        """Text-only reasoner inference; reasoner_text + golden first-token logits."""
        dump = _run_reasoner_probe(tmp_path, "inputs/reasoner/reasoner.json")
        _assert_matches_golden(dump, _TEXT_GOLDEN)

    @pytest.mark.level(2)
    @pytest.mark.gpus(4)
    def test_nano_reasoner_image_first_token_logits(tmp_path: Path) -> None:
        """Image-conditioned reasoner inference; reasoner_text + golden first-token logits."""
        dump = _run_reasoner_probe(tmp_path, "inputs/reasoner/reasoner_image.json")
        _assert_matches_golden(dump, _IMAGE_GOLDEN)