verl-project · mikequan0425 · May 27, 2026 · May 27, 2026 · May 27, 2026 · May 27, 2026
@@ -35,9 +35,12 @@ actor_rollout_ref:
       init_device: meta
       enable_full_shard: true
       ckpt_manager: dcp
+      load_checkpoint_path: null
       forward_prefetch: true
       strategy: veomni
       use_torch_compile: false
+      entropy_from_logits_with_chunking: false
+      entropy_checkpointing: false
       forward_only: false
       enable_fsdp_offload: false
       enable_reentrant: false
@@ -47,9 +50,13 @@ actor_rollout_ref:
       rms_norm_implementation: eager
       swiglu_mlp_implementation: eager
       rotary_pos_emb_implementation: eager
+      rms_norm_gated_implementation: eager
+      causal_conv1d_implementation: eager
+      chunk_gated_delta_rule_implementation: eager
       load_balancing_loss_implementation: eager
       force_use_huggingface: false
       activation_gpu_limit: 0.0
+      basic_modules: []
       moe_load_balance_monitor_interval: 0
       router_replay:
         _target_: verl.workers.config.EngineRouterReplayConfig
@@ -205,9 +212,12 @@ actor_rollout_ref:
       init_device: meta
       enable_full_shard: true
       ckpt_manager: dcp
+      load_checkpoint_path: null
       forward_prefetch: true
       strategy: veomni
       use_torch_compile: false
+      entropy_from_logits_with_chunking: false
+      entropy_checkpointing: false
       forward_only: true
       enable_fsdp_offload: false
       enable_reentrant: false
@@ -217,9 +227,13 @@ actor_rollout_ref:
       rms_norm_implementation: ${oc.select:actor_rollout_ref.actor.veomni.rms_norm_implementation,eager}
       swiglu_mlp_implementation: ${oc.select:actor_rollout_ref.actor.veomni.swiglu_mlp_implementation,eager}
       rotary_pos_emb_implementation: ${oc.select:actor_rollout_ref.actor.veomni.rotary_pos_emb_implementation,eager}
+      rms_norm_gated_implementation: eager
+      causal_conv1d_implementation: eager
+      chunk_gated_delta_rule_implementation: eager
       load_balancing_loss_implementation: ${oc.select:actor_rollout_ref.actor.veomni.load_balancing_loss_implementation,eager}
       force_use_huggingface: false
       activation_gpu_limit: 0.0
+      basic_modules: []
       moe_load_balance_monitor_interval: 0
       router_replay:
         _target_: verl.workers.config.EngineRouterReplayConfig
@@ -492,9 +506,12 @@ critic:
     init_device: meta
     enable_full_shard: true
     ckpt_manager: dcp
+    load_checkpoint_path: null
     forward_prefetch: true
     strategy: veomni
     use_torch_compile: false
+    entropy_from_logits_with_chunking: false
+    entropy_checkpointing: false
     forward_only: false
     enable_fsdp_offload: false
     enable_reentrant: false
@@ -504,9 +521,13 @@ critic:
     rms_norm_implementation: eager
     swiglu_mlp_implementation: eager
     rotary_pos_emb_implementation: eager
+    rms_norm_gated_implementation: eager
+    causal_conv1d_implementation: eager
+    chunk_gated_delta_rule_implementation: eager
     load_balancing_loss_implementation: eager
     force_use_huggingface: false
     activation_gpu_limit: 0.0
+    basic_modules: []
     moe_load_balance_monitor_interval: 0
     router_replay:
       _target_: verl.workers.config.EngineRouterReplayConfig

@@ -28,6 +28,9 @@ enable_full_shard: true
 
 ckpt_manager: dcp
 
+# Path to load checkpoint from, if any
+load_checkpoint_path: null
+
 # Only for FSDP1: FSDP1 configuration, prefetch the next forward-pass all-gather
 # before the current forward computation.
 forward_prefetch: true
@@ -37,6 +40,12 @@ strategy: veomni
 # Whether to use torch compile in fsdp.
 use_torch_compile: false
 
+# Whether to use entropy_from_logits_with_chunking in fsdp.
+entropy_from_logits_with_chunking: false
+
+# Whether to use entropy checkpointing in fsdp.
+entropy_checkpointing: false
+
 # Whether to use forward only in fsdp.
 forward_only: false
 
@@ -60,12 +69,18 @@ cross_entropy_loss_implementation: eager
 rms_norm_implementation: eager
 swiglu_mlp_implementation: eager
 rotary_pos_emb_implementation: eager
+rms_norm_gated_implementation: eager
+causal_conv1d_implementation: eager
+chunk_gated_delta_rule_implementation: eager
 load_balancing_loss_implementation: eager
 
 force_use_huggingface: false
 
 activation_gpu_limit: 0.0
 
+# List of basic modules to use
+basic_modules: []
+
 # MoE expert-load monitor interval. When > 0, attach VeOmni's MoERouterMonitor.
 # Scalar metrics flow through Tracking; heatmap images go to wandb on rank 0.
 moe_load_balance_monitor_interval: 0

diff --git a/verl/workers/config/engine.py b/verl/workers/config/engine.py
@@ -272,11 +272,9 @@ class VeOmniEngineConfig(EngineConfig):
     The inheritance from BaseConfig provides an omegaconf.DictConfig-like interface for a dataclass config.
 
     Args:
-        wrap_policy (Dict[str, Any]): Configuration for FSDP wrap policy.
         param_offload (bool): Whether to offload parameters to CPU, default False
         optimizer_offload (bool): Whether to offload optimizer states to CPU, default False
         offload_policy (bool): Whether to offload policy model parameters, default False
-        reshard_after_forward (bool): Whether to reshard parameters after forward pass, default True
         fsdp_size (int): FSDP group size. -1 means use all available GPUs, default -1
         ulysses_parallel_size (int): Ulysses sequence parallel size, default 1
         expert_parallel_size (int): Expert parallel size, default 1
@@ -324,22 +322,33 @@ class VeOmniEngineConfig(EngineConfig):
         basic_modules (Optional[list[str]]): List of basic modules to use, default is an empty list
         forward_prefetch (bool): Whether to prefetch parameters for next forward pass, default False
         model_dtype (str): Model data type used to initialize the transformers model. default "fp32"
-        use_orig_params (bool): Whether to use original parameters when initialize FSDP1, default False
         seed (int): Random seed for reproducibility.
         full_determinism (bool): If true, enable_full_determinism is called to ensure reproducible results
             in distributed training. Important: this will negatively impact performance, so only use it for
             debugging.
         mixed_precision (Optional[dict[str, Any]]): Mixed precision configuration for FSDP, default None
+        rms_norm_gated_implementation (str): Gated RMSNorm implementation (Qwen3.5 GatedDeltaNet
+            ``self.norm``). ``"fla"`` uses fla.modules.FusedRMSNormGated (requires flash-linear-attention,
+            GPU). ``"eager"`` (default) uses the HuggingFace Qwen3_5RMSNormGated. Qwen3.5 has no NPU
+            backend today — selecting any non-eager value on NPU raises at OpSlot bind time.
+        causal_conv1d_implementation (str): Varlen depthwise causal conv1d implementation (Qwen3.5
+            GatedDeltaNet pre-mixer). ``"fla"`` uses fla.modules.convolution.causal_conv1d (requires
+            flash-linear-attention, GPU). ``"eager"`` (default) leaves causal_conv1d_fn unset; the varlen
+            training path then raises because no torch fallback handles cu_seqlens. Qwen3.5 has no NPU
+            backend today — selecting any non-eager value on NPU raises at OpSlot bind time.
+        chunk_gated_delta_rule_implementation (str): Chunk gated delta-rule kernel for Qwen3.5 linear
+            attention. ``"fla"`` uses fla.ops.gated_delta_rule.chunk_gated_delta_rule (requires
+            flash-linear-attention, GPU). ``"flash_qla"`` uses QwenLM FlashQLA (requires the optional
+            flash-qla extra and a Hopper SM90 GPU; older Ampere/Ada and newer Blackwell GPUs are not
+            supported, and SM10x wheels are WIP upstream). ``"eager"`` (default) uses transformers'
+            torch_chunk_gated_delta_rule, which does NOT support cu_seqlens; varlen training therefore
+            raises at runtime. Qwen3.5 has no NPU backend today — selecting any non-eager value on NPU
+            raises at OpSlot bind time.
 
     """
 
     _mutable_fields = EngineConfig._mutable_fields | {"attn_implementation"}
 
-    wrap_policy: dict[str, Any] = field(default_factory=dict)
-    offload_policy: bool = False
-    reshard_after_forward: bool = True
     forward_prefetch: bool = False
-    use_orig_params: bool = False
     entropy_from_logits_with_chunking: bool = False
     use_torch_compile: bool = True
     entropy_checkpointing: bool = False
@@ -367,6 +376,9 @@ class VeOmniEngineConfig(EngineConfig):
     swiglu_mlp_implementation: str = "eager"
     rotary_pos_emb_implementation: str = "eager"
     load_balancing_loss_implementation: str = "eager"
+    rms_norm_gated_implementation: str = "eager"
+    causal_conv1d_implementation: str = "eager"
+    chunk_gated_delta_rule_implementation: str = "eager"
     force_use_huggingface: bool = False
     activation_gpu_limit: float = 0.0
     basic_modules: Optional[list[str]] = field(default_factory=list)

@@ -277,6 +277,9 @@ def _build_model_optimizer(self):
             swiglu_mlp_implementation=self.engine_config.swiglu_mlp_implementation,
             rotary_pos_emb_implementation=self.engine_config.rotary_pos_emb_implementation,
             load_balancing_loss_implementation=self.engine_config.load_balancing_loss_implementation,
+            rms_norm_gated_implementation=self.engine_config.rms_norm_gated_implementation,
+            causal_conv1d_implementation=self.engine_config.causal_conv1d_implementation,
+            chunk_gated_delta_rule_implementation=self.engine_config.chunk_gated_delta_rule_implementation,
         )
 
         # Load base model with specified configuration and dtype