From 66e088135063f92c4a4e9583c7428f22783c5830 Mon Sep 17 00:00:00 2001 From: "tianle.zhong" Date: Thu, 21 May 2026 00:30:45 +0000 Subject: [PATCH 1/4] [deps] feat: bump to transformers v5.2.0 + VeOmni a4ed599 VeOmni's main branch now defaults to transformers v5.2.0 (PR #751, gated via the `transformers-stable` dependency group); mirror that pin in vexact so users on the veomni extra resolve to the same version VeOmni tests/develops against. Changes: - veomni rev: 58759e7 -> a4ed599. Picks up the v5 default plus Qwen3-VL/Omni-MoE CPU-sync removals (#762, #764), v5 loader test (#727), MoE router replay hook (#719), and the v4 cleanup (#768). - vllm: 0.18.0 -> 0.19.1 (latest vllm still on torch 2.10; 0.20+ would cascade a torch 2.11 + flash-attn-wheel bump). - transformers pinned to 5.2.0 inside the `veomni` extra to record the version VeOmni tests/develops against. - override-dependencies forces transformers==5.2.0 globally, so in practice the pin applies to every install (not only the `veomni` extra); this is what lets the `vllm` extra (whose metadata still excludes 5.0.*-5.4.* and only whitelists 5.5.1+) coexist in the resolution. Smoke verification (1x8 H100, Qwen3-1.7B + gsm8k, 2 training steps): step:1 rollout_probs_diff_max=0.0 pearson_corr=0.9999999 actor/entropy=0.180 k3_kl=0.0 step:2 rollout_probs_diff_max=0.0 pearson_corr=1.0 actor/entropy=0.176 k3_kl=0.0 Bitwise actor<->rollout alignment is preserved under transformers v5. The 30B-A3B B200 recipe was not re-validated on H100 here because its `max_cache_blocks=4608` is sized for 192GB HBM and OOMs on 80GB H100; alignment math is shared with the 1.7B path, so v5 risk is contained to the import/dataclass surface (already covered by the 1.7B smoke). 
--- pyproject.toml | 14 ++++++++-- uv.lock | 70 ++++++++++++++++++++++++++++++++++---------------- 2 files changed, 60 insertions(+), 24 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 539ae0e..196b2aa 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -27,7 +27,7 @@ dev = [ "pre-commit" ] vllm = [ - "vllm==0.18.0", + "vllm==0.19.1", ] verl = [ "verl", @@ -37,6 +37,11 @@ verl = [ ] veomni = [ "veomni", + # VeOmni's default install pins transformers==5.2.0 (via its + # `transformers-stable` dependency group). Mirror that pin here so vexact + # users picking up the veomni extra resolve to the same version VeOmni + # tests/develops against. + "transformers==5.2.0", ] [build-system] @@ -125,7 +130,7 @@ known-third-party = [ verl = { git = "https://github.com/verl-project/verl.git", rev = "61f29997fb026a5a269dafccfe2f3bb800e32ef4" } # To work on verl locally, point this at `{ path = "./verl", editable = true }`. # To work on VeOmni locally, point this at `{ path = "./VeOmni", editable = true }`. -veomni = { git = "https://github.com/ByteDance-Seed/VeOmni.git", rev = "58759e78015ad429507079aa443215e3c515364f" } +veomni = { git = "https://github.com/ByteDance-Seed/VeOmni.git", rev = "a4ed599119afb21f5e559f15e95635f0edbbc5c6" } torch = [ { index = "pytorch", extra = "gpu" }, ] @@ -150,4 +155,9 @@ no-build-isolation-package = ["flash-attn"] environments = ["sys_platform == 'linux'"] override-dependencies = [ "opencv-python-headless<4.13.0", + # vllm 0.19.1's metadata still excludes transformers 5.0.*-5.4.* (only + # 5.5.1+ is whitelisted), but VeOmni pins transformers==5.2.0. Override + # vllm's conservative ceiling so the `vllm` and `veomni` extras can + # coexist; vllm 0.19.1 runs fine against transformers 5.2 in practice. 
+ "transformers==5.2.0", ] diff --git a/uv.lock b/uv.lock index 2908631..03b542b 100644 --- a/uv.lock +++ b/uv.lock @@ -10,7 +10,10 @@ supported-markers = [ ] [manifest] -overrides = [{ name = "opencv-python-headless", specifier = "<4.13.0" }] +overrides = [ + { name = "opencv-python-headless", specifier = "<4.13.0" }, + { name = "transformers", specifier = "==5.2.0" }, +] [[package]] name = "absl-py" @@ -339,7 +342,7 @@ wheels = [ [[package]] name = "compressed-tensors" -version = "0.13.0" +version = "0.15.0.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "loguru", marker = "sys_platform == 'linux'" }, @@ -347,9 +350,9 @@ dependencies = [ { name = "torch", marker = "sys_platform == 'linux'" }, { name = "transformers", marker = "sys_platform == 'linux'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/fc/65/88dd1c58fb9d0ded51b5c86471b937a1525f91fad2211a6f051dc1ea822d/compressed_tensors-0.13.0.tar.gz", hash = "sha256:23893824d3498ea3f1a829f14a8fa85f9a5e76a34c711a038b8d7c619ca9a67c", size = 200995, upload-time = "2025-12-16T16:03:55.397Z" } +sdist = { url = "https://files.pythonhosted.org/packages/41/1b/c3c4a98ec5f2727656336f07a0c35862195c310d8eb0b2fa5b4be6848680/compressed_tensors-0.15.0.1.tar.gz", hash = "sha256:a8e93054e8a5ec49c980b09ed36c4c1249b4a8ee167920a8e461c4da26e78d99", size = 229412, upload-time = "2026-04-10T14:23:54.708Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/0b/b5/61ac2563c62490922b603c09113a083fd74af3630ec3931e769484d6dcb5/compressed_tensors-0.13.0-py3-none-any.whl", hash = "sha256:3518799c9baf034eb642efb551db6b0537b8713d45a64fe4def26f7f8d6cabec", size = 192620, upload-time = "2025-12-16T16:03:53.041Z" }, + { url = "https://files.pythonhosted.org/packages/a8/52/93833dc1610e017ac5b7dcd59b8304d8ef67d1114c2d124e728a2cbbea12/compressed_tensors-0.15.0.1-py3-none-any.whl", hash = "sha256:e1b1f322e82e475715e242bad46925a304ea8e5c98b5055a15b8eb22fb6bfea9", size = 194260, upload-time = 
"2026-04-10T14:23:53.098Z" }, ] [[package]] @@ -695,6 +698,14 @@ requires-dist = [ ] provides-extras = ["dev"] +[[package]] +name = "flashinfer-cubin" +version = "0.6.6" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/12/e8/826f9452bc5f76b94d7eb025f03dcaf1b51b9ed7790386c0285191e69be4/flashinfer_cubin-0.6.6-py3-none-any.whl", hash = "sha256:36508dfc792eb5ecfb15d2c140a7702812e1fa1ab0fb03929b2ed55e3e8191f3", size = 267661457, upload-time = "2026-03-11T01:36:36.538Z" }, +] + [[package]] name = "flashinfer-python" version = "0.6.6" @@ -922,21 +933,22 @@ wheels = [ [[package]] name = "huggingface-hub" -version = "0.36.2" +version = "1.15.0" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "filelock", marker = "sys_platform == 'linux'" }, { name = "fsspec", marker = "sys_platform == 'linux'" }, - { name = "hf-xet", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'amd64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "hf-xet", marker = "(platform_machine == 'AMD64' and sys_platform == 'linux') or (platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'amd64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, + { name = "httpx", marker = "sys_platform == 'linux'" }, { name = "packaging", marker = "sys_platform == 'linux'" }, { name = "pyyaml", marker = "sys_platform == 'linux'" }, - { name = "requests", marker = "sys_platform == 'linux'" }, { name = "tqdm", marker = "sys_platform == 'linux'" }, + { name = "typer", marker = "sys_platform == 'linux'" }, { name = "typing-extensions", marker = "sys_platform == 'linux'" }, ] -sdist = { url = 
"https://files.pythonhosted.org/packages/7c/b7/8cb61d2eece5fb05a83271da168186721c450eb74e3c31f7ef3169fa475b/huggingface_hub-0.36.2.tar.gz", hash = "sha256:1934304d2fb224f8afa3b87007d58501acfda9215b334eed53072dd5e815ff7a", size = 649782, upload-time = "2026-02-06T09:24:13.098Z" } +sdist = { url = "https://files.pythonhosted.org/packages/bb/b6/e22bd20a25299c34b8c5922c1545a6320825b13906eb0f7298edfd034a0b/huggingface_hub-1.15.0.tar.gz", hash = "sha256:28abfdddda3927fd4de6a63cf26ab012498a2c24dae52baf150c5c6edf98a1d5", size = 784100, upload-time = "2026-05-15T11:42:52.149Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/a8/af/48ac8483240de756d2438c380746e7130d1c6f75802ef22f3c6d49982787/huggingface_hub-0.36.2-py3-none-any.whl", hash = "sha256:48f0c8eac16145dfce371e9d2d7772854a4f591bcb56c9cf548accf531d54270", size = 566395, upload-time = "2026-02-06T09:24:11.133Z" }, + { url = "https://files.pythonhosted.org/packages/6e/11/0b64cc9024329b76d7547c19a67604a61d21d3ba678a69d1b220c29d5112/huggingface_hub-1.15.0-py3-none-any.whl", hash = "sha256:a4a59af04cbc41a3fe3fec429b171ef994ef8c971eda10136746f408dd4e3744", size = 663602, upload-time = "2026-05-15T11:42:50.487Z" }, ] [[package]] @@ -2910,23 +2922,22 @@ wheels = [ [[package]] name = "transformers" -version = "4.57.6" +version = "5.2.0" source = { registry = "https://pypi.org/simple" } dependencies = [ - { name = "filelock", marker = "sys_platform == 'linux'" }, { name = "huggingface-hub", marker = "sys_platform == 'linux'" }, { name = "numpy", marker = "sys_platform == 'linux'" }, { name = "packaging", marker = "sys_platform == 'linux'" }, { name = "pyyaml", marker = "sys_platform == 'linux'" }, { name = "regex", marker = "sys_platform == 'linux'" }, - { name = "requests", marker = "sys_platform == 'linux'" }, { name = "safetensors", marker = "sys_platform == 'linux'" }, { name = "tokenizers", marker = "sys_platform == 'linux'" }, { name = "tqdm", marker = "sys_platform == 'linux'" }, + { name = "typer-slim", 
marker = "sys_platform == 'linux'" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/c4/35/67252acc1b929dc88b6602e8c4a982e64f31e733b804c14bc24b47da35e6/transformers-4.57.6.tar.gz", hash = "sha256:55e44126ece9dc0a291521b7e5492b572e6ef2766338a610b9ab5afbb70689d3", size = 10134912, upload-time = "2026-01-16T10:38:39.284Z" } +sdist = { url = "https://files.pythonhosted.org/packages/bd/7e/8a0c57d562015e5b16c97c1f0b8e0e92ead2c7c20513225dc12c2043ba9f/transformers-5.2.0.tar.gz", hash = "sha256:0088b8b46ccc9eff1a1dca72b5d618a5ee3b1befc3e418c9512b35dea9f9a650", size = 8618176, upload-time = "2026-02-16T18:54:02.867Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/03/b8/e484ef633af3887baeeb4b6ad12743363af7cce68ae51e938e00aaa0529d/transformers-4.57.6-py3-none-any.whl", hash = "sha256:4c9e9de11333ddfe5114bc872c9f370509198acf0b87a832a0ab9458e2bd0550", size = 11993498, upload-time = "2026-01-16T10:38:31.289Z" }, + { url = "https://files.pythonhosted.org/packages/4e/93/79754b0ca486e556c2b95d4f5afc66aaf4b260694f3d6e1b51da2d036691/transformers-5.2.0-py3-none-any.whl", hash = "sha256:9ecaf243dc45bee11a7d93f8caf03746accc0cb069181bbf4ad8566c53e854b4", size = 10403304, upload-time = "2026-02-16T18:53:59.699Z" }, ] [[package]] @@ -2953,6 +2964,18 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/4a/91/48db081e7a63bb37284f9fbcefda7c44c277b18b0e13fbc36ea2335b71e6/typer-0.24.1-py3-none-any.whl", hash = "sha256:112c1f0ce578bfb4cab9ffdabc68f031416ebcc216536611ba21f04e9aa84c9e", size = 56085, upload-time = "2026-02-21T16:54:41.616Z" }, ] +[[package]] +name = "typer-slim" +version = "0.24.0" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "typer", marker = "sys_platform == 'linux'" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/a7/a7/e6aecc4b4eb59598829a3b5076a93aff291b4fdaa2ded25efc4e1f4d219c/typer_slim-0.24.0.tar.gz", hash = "sha256:f0ed36127183f52ae6ced2ecb2521789995992c521a46083bfcdbb652d22ad34", 
size = 4776, upload-time = "2026-02-16T22:08:51.2Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a7/24/5480c20380dfd18cf33d14784096dca45a24eae6102e91d49a718d3b6855/typer_slim-0.24.0-py3-none-any.whl", hash = "sha256:d5d7ee1ee2834d5020c7c616ed5e0d0f29b9a4b1dd283bdebae198ec09778d0e", size = 3394, upload-time = "2026-02-16T22:08:49.92Z" }, +] + [[package]] name = "typing-extensions" version = "4.15.0" @@ -3020,8 +3043,8 @@ wheels = [ [[package]] name = "veomni" -version = "0.1.9a4" -source = { git = "https://github.com/ByteDance-Seed/VeOmni.git?rev=58759e78015ad429507079aa443215e3c515364f#58759e78015ad429507079aa443215e3c515364f" } +version = "0.1.9a5" +source = { git = "https://github.com/ByteDance-Seed/VeOmni.git?rev=a4ed599119afb21f5e559f15e95635f0edbbc5c6#a4ed599119afb21f5e559f15e95635f0edbbc5c6" } dependencies = [ { name = "blobfile", marker = "sys_platform == 'linux'" }, { name = "datasets", marker = "sys_platform == 'linux'" }, @@ -3088,6 +3111,7 @@ gpu = [ { name = "torchvision", marker = "sys_platform == 'linux'" }, ] veomni = [ + { name = "transformers", marker = "sys_platform == 'linux'" }, { name = "veomni", marker = "sys_platform == 'linux'" }, ] verl = [ @@ -3119,10 +3143,11 @@ requires-dist = [ { name = "torchaudio", marker = "extra == 'gpu'", index = "https://download.pytorch.org/whl/cu129" }, { name = "torchvision", marker = "extra == 'gpu'", index = "https://download.pytorch.org/whl/cu129" }, { name = "transformers" }, + { name = "transformers", marker = "extra == 'veomni'", specifier = "==5.2.0" }, { name = "uvicorn", marker = "extra == 'verl'" }, - { name = "veomni", marker = "extra == 'veomni'", git = "https://github.com/ByteDance-Seed/VeOmni.git?rev=58759e78015ad429507079aa443215e3c515364f" }, + { name = "veomni", marker = "extra == 'veomni'", git = "https://github.com/ByteDance-Seed/VeOmni.git?rev=a4ed599119afb21f5e559f15e95635f0edbbc5c6" }, { name = "verl", marker = "extra == 'verl'", git = 
"https://github.com/verl-project/verl.git?rev=61f29997fb026a5a269dafccfe2f3bb800e32ef4" }, - { name = "vllm", marker = "extra == 'vllm'", specifier = "==0.18.0" }, + { name = "vllm", marker = "extra == 'vllm'", specifier = "==0.19.1" }, ] provides-extras = ["gpu", "dev", "vllm", "verl", "veomni"] @@ -3143,7 +3168,7 @@ wheels = [ [[package]] name = "vllm" -version = "0.18.0" +version = "0.19.1" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "aiohttp", marker = "sys_platform == 'linux'" }, @@ -3158,6 +3183,7 @@ dependencies = [ { name = "einops", marker = "sys_platform == 'linux'" }, { name = "fastapi", extra = ["standard"], marker = "sys_platform == 'linux'" }, { name = "filelock", marker = "sys_platform == 'linux'" }, + { name = "flashinfer-cubin", marker = "sys_platform == 'linux'" }, { name = "flashinfer-python", marker = "sys_platform == 'linux'" }, { name = "gguf", marker = "sys_platform == 'linux'" }, { name = "ijson", marker = "sys_platform == 'linux'" }, @@ -3211,10 +3237,10 @@ dependencies = [ { name = "watchfiles", marker = "sys_platform == 'linux'" }, { name = "xgrammar", marker = "(platform_machine == 'aarch64' and sys_platform == 'linux') or (platform_machine == 'arm64' and sys_platform == 'linux') or (platform_machine == 'ppc64le' and sys_platform == 'linux') or (platform_machine == 's390x' and sys_platform == 'linux') or (platform_machine == 'x86_64' and sys_platform == 'linux')" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/39/a9/ed48c497572a41552cc71b5ede15291d882c546a01df27ed42944eb3b3ad/vllm-0.18.0.tar.gz", hash = "sha256:9a1bee091db8dbb4664a2a09cd9c61912e9912a44af1ce12b8593a231d05971c", size = 30812817, upload-time = "2026-03-20T22:16:59.039Z" } +sdist = { url = "https://files.pythonhosted.org/packages/a8/49/60a2a962ecbf780c8fbfd0d5548b208d654d5c4267df94d8d93883641431/vllm-0.19.1.tar.gz", hash = "sha256:9fb88ce6b50991eba41d183584f65f51d7f6015d86a42cdabf79c1c8bd5d66fa", size = 31105401, upload-time = 
"2026-04-18T05:50:15.143Z" } wheels = [ - { url = "https://files.pythonhosted.org/packages/37/f8/d677aef63536b54d8d0268241dc50a4fe01efcf162ef26190b9ed4bed109/vllm-0.18.0-cp38-abi3-manylinux_2_31_aarch64.whl", hash = "sha256:66a2c5bcf1bdf8de3e63b9fee067754068108cd510c65ffba70ff4368c33cba8", size = 385589729, upload-time = "2026-03-20T22:16:38.734Z" }, - { url = "https://files.pythonhosted.org/packages/4f/e9/59cf9b8939b51e859d2166ac3336b353f52ec4f9ceda34228aae7b386840/vllm-0.18.0-cp38-abi3-manylinux_2_31_x86_64.whl", hash = "sha256:0bc51491598f4bcd161b693b27cbe2864082d6c49fa9065965d94b371f6ae8ef", size = 433215727, upload-time = "2026-03-20T22:16:00.336Z" }, + { url = "https://files.pythonhosted.org/packages/28/4c/26c426103c58ac8d98435fe63c7758a2f289b5481a08be19e9c9fe29a4c2/vllm-0.19.1-cp38-abi3-manylinux_2_31_aarch64.whl", hash = "sha256:c8dde3c9af20f00a644e64a50ebe43948f2921bab3ffd5407d634c15836cb181", size = 385252556, upload-time = "2026-04-18T05:49:16.101Z" }, + { url = "https://files.pythonhosted.org/packages/78/20/f41216b79c87372a9d03175f36fa1411ee61059ce8c557d2691722ea4aae/vllm-0.19.1-cp38-abi3-manylinux_2_31_x86_64.whl", hash = "sha256:71a87f46cafab4489c69a5c5c83b870d0235e5694d8222303d460576293dc719", size = 433132101, upload-time = "2026-04-18T05:49:54.202Z" }, ] [[package]] From e7e4f88114c636e7eec8730d4bc7af446de0f579 Mon Sep 17 00:00:00 2001 From: "tianle.zhong" Date: Thu, 21 May 2026 20:08:23 +0000 Subject: [PATCH 2/4] [model] fix: align vexact MoE rollout with VeOmni v5 actor side MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Without these fixes vexact rollout drifts severely from the actor side on both supported MoE architectures under transformers v5 (``rollout_probs_diff_max`` ≈0.99/0.998), making bitwise-aligned RL training impossible. Five concrete causes; one fix per cause: 1. ``ModelCreator`` allocated the ``"weights"`` TorchMemorySaver region with ``enable_cpu_backup=False``. 
``rollout.release()`` pauses every region, so weights were *freed* on pause and came back as uninitialised garbage on resume. Flip the flag to ``True`` so the offload preserves values. 2. VeOmni v5's actor exposes MoE experts as a fused ``mlp.experts.gate_up_proj`` (shape ``[E, 2I, H]``), but vexact stored per-projection ``gate_proj`` / ``up_proj`` tensors. verl's bucketed FSDP→rollout transfer therefore silently dropped every expert key. Move vexact's ``Qwen3MoeExperts`` and ``PatchDeepseekV3NaiveMoe`` to the fused layout, slice into per-projection views at MoE-forward time. 3. Extend the per-expert loader to also accept ``gate_up_proj.weight`` (the bucketed sync ships ``mlp.experts.{idx}.gate_up_proj.weight``) and to know how to write the fused destination from per-projection disk keys. 4. Use VeOmni's ``veomni.ops.fused_moe_forward`` (``fc1_1_2_weight=…`` merged-fc1 path) on the rollout side too so the kernels match the actor's exactly. Initialise ``veomni.distributed.parallel_state`` (non-EP, ``dp_size=world_size``) in ``Worker.__init__`` because the group_gemm/quack/npu kernels read ``get_parallel_state().ep_enabled`` on every forward and would otherwise crash with ``ValueError: product of parallel sizes…`` under PP>1. The kernel binding tolerates CPU-only worker processes (AgentLoopWorker, etc.). 5. For deepseek_v3: VeOmni v5's stock ``DeepseekV3Attention.forward`` pads ``value_states`` to ``qk_head_dim`` when FA is requested, forcing FA4 onto its non-MLA codegen path. vexact uses FA4's MLA-native path (unpadded V=128). Patch VeOmni's actor-side attention / RoPE / RMSNorm modules to use vexact's MLA-native versions so the two sides hit identical kernel call signatures. The three v4 ``trainer.use_legacy_worker_impl=disable`` Hydra overrides in ``examples/moe/run_qwen3_30B_A3B_*.sh`` are removed; that knob no longer exists in the current verl pin and the recipes errored out immediately when run. 
Smoke verification (1x8 H100, transformers 5.2.0, veomni a4ed599): Qwen3-30B-A3B (qwen3_moe), examples/moe/run_qwen3_30B_A3B_dapo.sh step:1 rollout_probs_diff_max=0.0 pearson_corr=1.0 entropy=0.124 Moonlight-16B-A3B (deepseek_v3 / MLA), examples/moe/run_moonlight_gsm8k.sh step:1 rollout_probs_diff_max=0.0 pearson_corr=1.0 entropy=0.036 Both archs now match the dense Qwen3-1.7B baseline. --- examples/moe/run_qwen3_30B_A3B_16H100.sh | 1 - examples/moe/run_qwen3_30B_A3B_dapo.sh | 1 - examples/moe/run_qwen3_30B_A3B_reinforce.sh | 1 - vexact/inferencer/model_loader.py | 8 +- .../deepseek_v3/modeling_deepseek_v3.py | 93 +++++++++++++++---- vexact/models/qwen3_moe/modeling_qwen3_moe.py | 69 ++++++++++---- vexact/models/register.py | 22 +++++ vexact/worker/worker.py | 14 +++ 8 files changed, 167 insertions(+), 42 deletions(-) diff --git a/examples/moe/run_qwen3_30B_A3B_16H100.sh b/examples/moe/run_qwen3_30B_A3B_16H100.sh index 6c63c4b..7d33df2 100644 --- a/examples/moe/run_qwen3_30B_A3B_16H100.sh +++ b/examples/moe/run_qwen3_30B_A3B_16H100.sh @@ -81,7 +81,6 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.rollout.n=8 \ actor_rollout_ref.ref.veomni.optimizer_offload=True \ algorithm.use_kl_in_reward=False \ - trainer.use_legacy_worker_impl=disable \ trainer.critic_warmup=0 \ trainer.logger=['console','wandb'] \ trainer.project_name='verl_grpo_example_gsm8k_math' \ diff --git a/examples/moe/run_qwen3_30B_A3B_dapo.sh b/examples/moe/run_qwen3_30B_A3B_dapo.sh index 3327725..f9708eb 100644 --- a/examples/moe/run_qwen3_30B_A3B_dapo.sh +++ b/examples/moe/run_qwen3_30B_A3B_dapo.sh @@ -83,7 +83,6 @@ python3 -m verl.trainer.main_ppo \ actor_rollout_ref.rollout.profiler.save_path=$profile_save_path \ actor_rollout_ref.ref.veomni.optimizer_offload=True \ algorithm.use_kl_in_reward=False \ - trainer.use_legacy_worker_impl=disable \ trainer.critic_warmup=0 \ trainer.logger=['console','wandb'] \ trainer.project_name='verl_grpo_qwen3moe_dapo' \ diff --git 
a/examples/moe/run_qwen3_30B_A3B_reinforce.sh b/examples/moe/run_qwen3_30B_A3B_reinforce.sh index 626ed12..ed0d416 100644 --- a/examples/moe/run_qwen3_30B_A3B_reinforce.sh +++ b/examples/moe/run_qwen3_30B_A3B_reinforce.sh @@ -94,7 +94,6 @@ RAY_DEDUP_LOGS=0 PYTHONUNBUFFERED=1 python3 -m verl.trainer.main_ppo \ custom_reward_function.name=compute_math_score \ trainer.project_name=vexact-baseline-math-moe-reinforce \ trainer.experiment_name=vexact-exp-MOE \ - trainer.use_legacy_worker_impl=disable \ trainer.test_freq=20 \ trainer.log_val_generations=20 \ trainer.val_before_train=True \ diff --git a/vexact/inferencer/model_loader.py b/vexact/inferencer/model_loader.py index 03f113f..5ac1921 100644 --- a/vexact/inferencer/model_loader.py +++ b/vexact/inferencer/model_loader.py @@ -298,7 +298,13 @@ def create_model(self): if self._pp_info.pp_size > 1: self._apply_pp() - with TorchMemorySaverAdapter.get_instance().region("weights", enable_cpu_backup=False): + # ``enable_cpu_backup=True``: torch_memory_saver offloads weights to + # CPU on pause and restores them on resume. Without this, pause→resume + # leaves the GPU memory uninitialized and the model produces garbage + # logits whenever any weight key isn't re-covered by the subsequent + # FSDP→rollout sync (notably an issue for MoE archs whose actor-side + # state_dict naming evolves across transformers releases). 
+ with TorchMemorySaverAdapter.get_instance().region("weights", enable_cpu_backup=True): init_parameters(self._causal_model, self._config.dtype, self._device) load_weights_from_weight_path(self._causal_model, self._config, self._model_path) diff --git a/vexact/models/deepseek_v3/modeling_deepseek_v3.py b/vexact/models/deepseek_v3/modeling_deepseek_v3.py index ea29065..5750043 100644 --- a/vexact/models/deepseek_v3/modeling_deepseek_v3.py +++ b/vexact/models/deepseek_v3/modeling_deepseek_v3.py @@ -100,22 +100,26 @@ def forward(self, hidden_states): class PatchDeepseekV3NaiveMoe(nn.Module): - """Identical to VeOmni's PatchDeepseekV3NaiveMoe, but uses vexact's fused_moe_forward.""" + """Rollout-side mirror of VeOmni's PatchDeepseekV3NaiveMoe; the fused path calls VeOmni's fused_moe_forward. + + Storage layout matches VeOmni's v5-patched ``DeepseekV3NaiveMoe`` (fused + ``gate_up_proj`` of shape ``(num_experts, 2*intermediate, hidden)`` plus + separate ``down_proj``). This keeps the rollout-side parameter names + identical to the actor-side names so verl's bucketed FSDP→rollout weight + sync copies tensors directly without an unfuse step. 
+ """ def __init__(self, config): super().__init__() self.num_experts = config.n_routed_experts self.hidden_dim = config.hidden_size self.intermediate_dim = config.moe_intermediate_size - self.gate_proj = nn.Parameter(torch.empty(self.num_experts, self.intermediate_dim, self.hidden_dim)) - self.up_proj = nn.Parameter(torch.empty(self.num_experts, self.intermediate_dim, self.hidden_dim)) + self.gate_up_proj = nn.Parameter(torch.empty(self.num_experts, 2 * self.intermediate_dim, self.hidden_dim)) self.down_proj = nn.Parameter(torch.empty(self.num_experts, self.hidden_dim, self.intermediate_dim)) self.act_fn = ACT2FN[config.hidden_act] self._moe_implementation = getattr(config, "_moe_implementation", "fused") def forward(self, hidden_states, top_k_index, top_k_weights): - from vexact.batch_invariant_ops.fused_moe import fused_moe_forward - final_hidden_states = torch.zeros_like(hidden_states) if self._moe_implementation == "eager": @@ -130,22 +134,27 @@ def forward(self, hidden_states, top_k_index, top_k_weights): continue top_k_pos, token_idx = torch.where(expert_mask[expert_idx]) current_state = hidden_states[token_idx] - gate = nn.functional.linear(current_state, self.gate_proj[expert_idx]) - up = nn.functional.linear(current_state, self.up_proj[expert_idx]) + gate, up = nn.functional.linear(current_state, self.gate_up_proj[expert_idx]).chunk(2, dim=-1) current_hidden_states = self.act_fn(gate) * up current_hidden_states = nn.functional.linear(current_hidden_states, self.down_proj[expert_idx]) current_hidden_states = current_hidden_states * top_k_weights[token_idx, top_k_pos, None] final_hidden_states.index_add_(0, token_idx, current_hidden_states.to(final_hidden_states.dtype)) elif self._moe_implementation == "fused": + # Use VeOmni's fused MoE kernel directly (fc1_1_2_weight path), so + # rollout-side and actor-side experts compute with identical + # arithmetic — required for `rollout_probs_diff_max == 0`. 
+ from veomni.ops import fused_moe_forward + top_k_weights = top_k_weights.to(final_hidden_states.dtype) final_hidden_states = fused_moe_forward( num_experts=self.num_experts, routing_weights=top_k_weights, selected_experts=top_k_index, hidden_states=hidden_states, - fc1_1_weight=self.gate_proj, - fc1_2_weight=self.up_proj, + fc1_1_weight=None, + fc1_2_weight=None, fc2_weight=self.down_proj, + fc1_1_2_weight=self.gate_up_proj, ) else: raise ValueError(f"Invalid moe implementation: {self._moe_implementation}") @@ -300,7 +309,7 @@ def _patched_load_pretrained_model( ### then weights are loaded by load_weights. ### We override load_weights to fuse expert weights. ### These should be removed after we upgrade to HF Transformers v5. -_EXPERT_PROJS = {"gate_proj", "up_proj", "down_proj"} +_EXPERT_PROJS = {"gate_proj", "up_proj", "down_proj", "gate_up_proj"} def load_deepseek_v3_weights( @@ -322,11 +331,15 @@ def load_deepseek_v3_weights( for full_name, loaded_weight in weight_iterator: if ".experts." in full_name and full_name.endswith(".weight"): - # Expected expert key format: + # Disk checkpoints (HF) store experts as per-expert separate keys: # .experts...weight # Example: # model.layers.3.mlp.experts.42.gate_proj.weight - # Copy received single expert weight into corresponding position in fused expert params + # Fuse them into rollout-side ``gate_up_proj``/``down_proj`` stacked + # tensors at the right per-expert slot. FSDP-sync from the VeOmni v5 + # actor delivers experts as ``.experts.gate_up_proj`` / + # ``.down_proj`` (already fused) — those names land directly in + # ``full_param_dict`` via the elif branch below. 
prefix, rest = full_name.split(".experts.", 1) try: expert_idx_str, proj, suffix = rest.split(".") @@ -339,10 +352,24 @@ def load_deepseek_v3_weights( except AttributeError: block = None experts = getattr(block, "experts", None) if block is not None else None - target = getattr(experts, proj, None) if isinstance(experts, PatchDeepseekV3NaiveMoe) else None - if target is not None and expert_idx < target.shape[0]: + if isinstance(experts, PatchDeepseekV3NaiveMoe) and expert_idx < experts.num_experts: with torch.no_grad(): - target[expert_idx].copy_(loaded_weight.to(device=target.device, dtype=target.dtype)) + if proj == "gate_proj": + target_slice = experts.gate_up_proj[expert_idx, : experts.intermediate_dim, :] + target_slice.copy_(loaded_weight.to(device=target_slice.device, dtype=target_slice.dtype)) + elif proj == "up_proj": + target_slice = experts.gate_up_proj[expert_idx, experts.intermediate_dim :, :] + target_slice.copy_(loaded_weight.to(device=target_slice.device, dtype=target_slice.dtype)) + elif proj == "gate_up_proj": + # FSDP-sync path: actor (VeOmni v5) ships the + # already-fused per-expert tensor under + # ``experts.{idx}.gate_up_proj.weight``; copy + # straight into the matching per-expert slot. + target_slice = experts.gate_up_proj[expert_idx] + target_slice.copy_(loaded_weight.to(device=target_slice.device, dtype=target_slice.dtype)) + elif proj == "down_proj": + target_slice = experts.down_proj[expert_idx] + target_slice.copy_(loaded_weight.to(device=target_slice.device, dtype=target_slice.dtype)) direct_loaded_blocks.add(prefix) continue @@ -426,12 +453,42 @@ def apply_deepseek_v3_patches() -> None: DeepseekV3Attention.forward = deepseek_v3_attention_forward - # Patch RotaryEmbedding to use deterministic Triton bmm for cos/sin computation + # Align actor-side and rollout-side ``DeepseekV3Attention`` / RoPE / + # RMSNorm code paths so the bitwise-aligned MoE rollout works under v5. + # + # 1. 
Attention: VeOmni v5's stock ``DeepseekV3Attention.forward`` pads + # ``value_states`` to ``qk_head_dim`` when FA is requested. This forces + # the FA4 kernel onto its standard (non-MLA) codegen path, which is + # numerically distinct from the MLA-native path that vexact's rollout + # uses (unpadded V=128). Reuse the rollout's forward on actor too. + # 2. RoPE / RMSNorm: use vexact's deterministic Triton-bmm RoPE and + # batch-invariant RMSNorm on actor too so the cos/sin and normalised + # activations match the rollout side bit-for-bit. + deterministic_rope_forward = _make_deterministic_rope_forward() + from transformers.models.deepseek_v3.modeling_deepseek_v3 import DeepseekV3RotaryEmbedding - DeepseekV3RotaryEmbedding.forward = _make_deterministic_rope_forward() + DeepseekV3RotaryEmbedding.forward = deterministic_rope_forward + + try: + import veomni.models.transformers.deepseek_v3.generated.patched_modeling_deepseek_v3_gpu as _veomni_dsv3 + + _veomni_dsv3.DeepseekV3Attention.forward = deepseek_v3_attention_forward + _veomni_dsv3.DeepseekV3RotaryEmbedding.forward = deterministic_rope_forward + # Match vexact's batch-invariant RMSNorm on actor side too. + from vexact.batch_invariant_ops import batch_invariant_rms_norm as _bi_rms_norm + + def _bi_rms_norm_forward(self, hidden_states): + return _bi_rms_norm(hidden_states, self.weight, self.variance_epsilon) + + _veomni_dsv3.DeepseekV3RMSNorm.forward = _bi_rms_norm_forward + logger.info( + "[VEXACT] Patched VeOmni actor-side DeepseekV3Attention/RoPE/RMSNorm to match rollout" + ) + except Exception as e: + logger.info(f"[VEXACT] Skipped VeOmni actor-side attention/RoPE/RMSNorm patch ({e})") - # Patch RMSNorm to use batch-invariant Triton kernel + # Patch transformers-stock RMSNorm too (used by rollout side via stock class). 
_patch_rms_norm_batch_invariant() logger.info("Applied DeepSeek-V3 monkey patches.") diff --git a/vexact/models/qwen3_moe/modeling_qwen3_moe.py b/vexact/models/qwen3_moe/modeling_qwen3_moe.py index 4d6fbbc..c52b063 100644 --- a/vexact/models/qwen3_moe/modeling_qwen3_moe.py +++ b/vexact/models/qwen3_moe/modeling_qwen3_moe.py @@ -42,6 +42,12 @@ class Qwen3MoeExperts(nn.Module): """ Fused experts container that stores all expert weights in stacked tensors. + + Storage layout matches VeOmni's v5-patched ``Qwen3MoeExperts`` (fused + ``gate_up_proj`` of shape ``(num_experts, 2*intermediate, hidden)`` plus + separate ``down_proj``). This keeps the rollout-side parameter names + identical to the actor-side names so that verl's bucketed FSDP→rollout + weight sync can copy tensors directly without an unfuse step. """ def __init__(self, config: Qwen3MoeConfig): @@ -51,12 +57,8 @@ def __init__(self, config: Qwen3MoeConfig): self.intermediate_size = config.moe_intermediate_size self.act_fn = ACT2FN[config.hidden_act] - self.gate_proj = nn.Parameter( - torch.empty(self.num_experts, self.intermediate_size, self.hidden_dim), - requires_grad=True, - ) - self.up_proj = nn.Parameter( - torch.empty(self.num_experts, self.intermediate_size, self.hidden_dim), + self.gate_up_proj = nn.Parameter( + torch.empty(self.num_experts, 2 * self.intermediate_size, self.hidden_dim), requires_grad=True, ) self.down_proj = nn.Parameter( @@ -72,25 +74,29 @@ def forward( selected_experts: Optional[torch.Tensor] = None, ) -> torch.Tensor: if expert_idx is not None: - gate_proj_out = torch.matmul(hidden_states, self.gate_proj[expert_idx].transpose(0, 1)) - up_proj_out = torch.matmul(hidden_states, self.up_proj[expert_idx].transpose(0, 1)) - hidden = self.act_fn(gate_proj_out) * up_proj_out + gate_up = torch.matmul(hidden_states, self.gate_up_proj[expert_idx].transpose(0, 1)) + gate, up = gate_up.chunk(2, dim=-1) + hidden = self.act_fn(gate) * up return torch.matmul(hidden, 
self.down_proj[expert_idx].transpose(0, 1)) assert routing_weights is not None and selected_experts is not None, ( "routing_weights and selected_experts must be provided when expert_idx is None" ) - from vexact.batch_invariant_ops.fused_moe import fused_moe_forward + # Use VeOmni's fused MoE kernel directly (fc1_1_2_weight path), so the + # rollout side and VeOmni's actor side compute experts with identical + # arithmetic — required for `rollout_probs_diff_max == 0`. + from veomni.ops import fused_moe_forward return fused_moe_forward( num_experts=self.num_experts, routing_weights=routing_weights, selected_experts=selected_experts, hidden_states=hidden_states, - fc1_1_weight=self.gate_proj, - fc1_2_weight=self.up_proj, + fc1_1_weight=None, + fc1_2_weight=None, fc2_weight=self.down_proj, + fc1_1_2_weight=self.gate_up_proj, ) @@ -133,7 +139,10 @@ def moe_block_forward(self: Qwen3MoeSparseMoeBlock, hidden_states: torch.Tensor) selected_experts=selected_experts, ) final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim) - return final_hidden_states, router_logits + # transformers v5: Qwen3MoeSparseMoeBlock.forward returns only hidden_states + # (router_logits collected via OutputRecorder hook on Qwen3MoeTopKRouter, + # which our plain-Linear gate bypasses; rollout doesn't need aux-loss stats). + return final_hidden_states ### Model patching ends @@ -178,7 +187,7 @@ def _patched_load_pretrained_model( ### then weights are loaded by load_weights, ### we will override the load_weights method to fuse experts weights ### These should be removed after we upgrade to HF Transformers v5. -_EXPERT_PROJS = {"gate_proj", "up_proj", "down_proj"} +_EXPERT_PROJS = {"gate_proj", "up_proj", "down_proj", "gate_up_proj"} def load_qwen3_moe_weights( @@ -205,11 +214,12 @@ def load_qwen3_moe_weights( # extra name and weight tensor from the weight iterator for full_name, loaded_weight in weight_iterator: if ".experts." 
in full_name and full_name.endswith(".weight"): - # Expected expert key format: + # Disk checkpoints (HF) store experts as per-expert separate keys: # <prefix>.experts.<expert_idx>.<proj>.weight # Example: # model.layers.0.mlp.experts.3.gate_proj.weight - # Copy received single expert weight into corresponding position in fused expert params + # Fuse them into the rollout-side ``gate_up_proj``/``down_proj`` + # stacked tensors at the right per-expert slot. prefix, rest = full_name.split(".experts.", 1) try: expert_idx_str, proj, suffix = rest.split(".") @@ -217,20 +227,39 @@ def load_qwen3_moe_weights( # Not a 3-part suffix, skip. expert_idx_str, proj, suffix = None, None, None if suffix == "weight" and proj in _EXPERT_PROJS and expert_idx_str is not None and expert_idx_str.isdigit(): - # Copy per-expert weights directly into fused expert params. expert_idx = int(expert_idx_str) try: block = self.get_submodule(prefix) except AttributeError: block = None experts = getattr(block, "experts", None) if block is not None else None - target = getattr(experts, proj, None) if isinstance(experts, Qwen3MoeExperts) else None - if target is not None and expert_idx < target.shape[0]: + if isinstance(experts, Qwen3MoeExperts) and expert_idx < experts.num_experts: with torch.no_grad(): - target[expert_idx].copy_(loaded_weight.to(device=target.device, dtype=target.dtype)) + if proj == "gate_proj": + target_slice = experts.gate_up_proj[expert_idx, : experts.intermediate_size, :] + target_slice.copy_(loaded_weight.to(device=target_slice.device, dtype=target_slice.dtype)) + elif proj == "up_proj": + target_slice = experts.gate_up_proj[expert_idx, experts.intermediate_size :, :] + target_slice.copy_(loaded_weight.to(device=target_slice.device, dtype=target_slice.dtype)) + elif proj == "gate_up_proj": + # FSDP-sync path: actor (VeOmni v5) ships the + # already-fused per-expert tensor under + # ``experts.{idx}.gate_up_proj.weight``; copy + # straight into the matching per-expert slot.
+ target_slice = experts.gate_up_proj[expert_idx] + target_slice.copy_(loaded_weight.to(device=target_slice.device, dtype=target_slice.dtype)) + elif proj == "down_proj": + target_slice = experts.down_proj[expert_idx] + target_slice.copy_(loaded_weight.to(device=target_slice.device, dtype=target_slice.dtype)) direct_loaded_blocks.add(prefix) continue + # FSDP-sync from the actor (VeOmni v5) delivers experts as fused stacked + # tensors named ``<prefix>.experts.gate_up_proj`` and + # ``<prefix>.experts.down_proj``. Those names land directly in + # ``full_param_dict`` (our experts are stored in the same fused layout), + # so the elif below copies them in one shot. + # in the checkpoints there is a weight with name model.embed_tokens.weight if full_name == "model.embed_tokens.weight": embed_tokens_weight = loaded_weight diff --git a/vexact/models/register.py b/vexact/models/register.py index 4ae9aae..cd9f8ea 100644 --- a/vexact/models/register.py +++ b/vexact/models/register.py @@ -28,6 +28,28 @@ def register_models() -> None: disable_vexact_patch = os.getenv("VEXACT_DISABLE_MODEL_PATCH", "0") == "1" if not disable_vexact_patch: + # Bind VeOmni's fused MoE kernel so the rollout side reuses the same + # implementation as VeOmni's actor side (required for bitwise + # alignment on MoE archs). Prefer quack (SM90+, what VeOmni's + # ``moe_implementation=fused`` resolves to on GPU) and fall back to + # triton if quack is unavailable. + # + # ``register_models`` runs in every verl process that imports vexact, + # including CPU-only AgentLoopWorker / data workers that don't have + # the GPU kernels available. Skip silently there.
+ from veomni.ops.kernels.moe import apply_veomni_fused_moe_patch + + moe_kernel = os.getenv("VEXACT_MOE_KERNEL", "quack") + try: + apply_veomni_fused_moe_patch(fused_moe_kernel=moe_kernel) + print(f"[VEXACT] register_models(): bound VeOmni fused MoE kernel ({moe_kernel})") + except RuntimeError as e_quack: + try: + apply_veomni_fused_moe_patch(fused_moe_kernel="triton") + print(f"[VEXACT] register_models(): bound VeOmni fused MoE kernel (triton, '{moe_kernel}' unavailable: {e_quack})") + except RuntimeError as e_triton: + print(f"[VEXACT] register_models(): skipping VeOmni MoE kernel binding (no GPU kernel available: quack={e_quack}; triton={e_triton})") + from .qwen3_moe.modeling_qwen3_moe import apply_qwen3_moe_patches apply_qwen3_moe_patches() diff --git a/vexact/worker/worker.py b/vexact/worker/worker.py index 678bae0..c4c303d 100644 --- a/vexact/worker/worker.py +++ b/vexact/worker/worker.py @@ -110,6 +110,20 @@ def __init__(self, config: VeXactConfig, rank: int): if self.pp_info.pp_size > 1: pp_messager = PPMessager(pp_info=self.pp_info, parallel_config=config.parallel, device=self.device) + # VeOmni's fused MoE kernels (group_gemm/quack/npu) consult + # ``get_parallel_state().ep_enabled`` on each forward pass to decide + # between the EP and non-EP code paths. The lazy default + # ``ParallelState()`` asserts ``pp*dp*cp*ulysses*tp == world_size`` and + # therefore raises whenever torch.distributed reports world_size > 1 + # (e.g. our PP>1 rollout). Bind a non-EP parallel state up front so + # subsequent MoE forwards take the cheap non-EP path; we never run + # expert parallelism inside the rollout worker. The helper is + # idempotent (warns + early-return when state already exists). 
+ if torch.distributed.is_initialized(): + from veomni.distributed.parallel_state import init_parallel_state + + init_parallel_state(dp_size=torch.distributed.get_world_size()) + self.inferencer = Inferencer( model=self.model, config=config, From 103ef489d6a82a14fe7667c13ced41fc4248c67a Mon Sep 17 00:00:00 2001 From: "tianle.zhong" Date: Thu, 21 May 2026 20:14:50 +0000 Subject: [PATCH 3/4] [model] fix: wrap long MoE-kernel-binding log lines under 120 cols --- vexact/models/register.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/vexact/models/register.py b/vexact/models/register.py index cd9f8ea..696a20a 100644 --- a/vexact/models/register.py +++ b/vexact/models/register.py @@ -46,9 +46,15 @@ def register_models() -> None: except RuntimeError as e_quack: try: apply_veomni_fused_moe_patch(fused_moe_kernel="triton") - print(f"[VEXACT] register_models(): bound VeOmni fused MoE kernel (triton, '{moe_kernel}' unavailable: {e_quack})") + print( + f"[VEXACT] register_models(): bound VeOmni fused MoE kernel " + f"(triton, '{moe_kernel}' unavailable: {e_quack})" + ) except RuntimeError as e_triton: - print(f"[VEXACT] register_models(): skipping VeOmni MoE kernel binding (no GPU kernel available: quack={e_quack}; triton={e_triton})") + print( + "[VEXACT] register_models(): skipping VeOmni MoE kernel binding " + f"(no GPU kernel available: quack={e_quack}; triton={e_triton})" + ) from .qwen3_moe.modeling_qwen3_moe import apply_qwen3_moe_patches From d5f6ffbd977a59b7ee24087dc4679792f0aefad5 Mon Sep 17 00:00:00 2001 From: "tianle.zhong" Date: Thu, 21 May 2026 20:16:14 +0000 Subject: [PATCH 4/4] [model] fix: ruff format vexact deepseek_v3 --- vexact/models/deepseek_v3/modeling_deepseek_v3.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/vexact/models/deepseek_v3/modeling_deepseek_v3.py b/vexact/models/deepseek_v3/modeling_deepseek_v3.py index 5750043..2eeea45 100644 --- a/vexact/models/deepseek_v3/modeling_deepseek_v3.py +++ 
b/vexact/models/deepseek_v3/modeling_deepseek_v3.py @@ -482,9 +482,7 @@ def _bi_rms_norm_forward(self, hidden_states): return _bi_rms_norm(hidden_states, self.weight, self.variance_epsilon) _veomni_dsv3.DeepseekV3RMSNorm.forward = _bi_rms_norm_forward - logger.info( - "[VEXACT] Patched VeOmni actor-side DeepseekV3Attention/RoPE/RMSNorm to match rollout" - ) + logger.info("[VEXACT] Patched VeOmni actor-side DeepseekV3Attention/RoPE/RMSNorm to match rollout") except Exception as e: logger.info(f"[VEXACT] Skipped VeOmni actor-side attention/RoPE/RMSNorm patch ({e})")