Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 21 additions & 0 deletions verl/trainer/config/_generated_ppo_veomni_trainer.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -35,9 +35,12 @@ actor_rollout_ref:
init_device: meta
enable_full_shard: true
ckpt_manager: dcp
load_checkpoint_path: null
forward_prefetch: true
strategy: veomni
use_torch_compile: false
entropy_from_logits_with_chunking: false
entropy_checkpointing: false
forward_only: false
enable_fsdp_offload: false
enable_reentrant: false
Expand All @@ -47,9 +50,13 @@ actor_rollout_ref:
rms_norm_implementation: eager
swiglu_mlp_implementation: eager
rotary_pos_emb_implementation: eager
rms_norm_gated_implementation: eager
causal_conv1d_implementation: eager
chunk_gated_delta_rule_implementation: eager
load_balancing_loss_implementation: eager
force_use_huggingface: false
activation_gpu_limit: 0.0
basic_modules: []
moe_load_balance_monitor_interval: 0
router_replay:
_target_: verl.workers.config.EngineRouterReplayConfig
Expand Down Expand Up @@ -205,9 +212,12 @@ actor_rollout_ref:
init_device: meta
enable_full_shard: true
ckpt_manager: dcp
load_checkpoint_path: null
forward_prefetch: true
strategy: veomni
use_torch_compile: false
entropy_from_logits_with_chunking: false
entropy_checkpointing: false
forward_only: true
enable_fsdp_offload: false
enable_reentrant: false
Expand All @@ -217,9 +227,13 @@ actor_rollout_ref:
rms_norm_implementation: ${oc.select:actor_rollout_ref.actor.veomni.rms_norm_implementation,eager}
swiglu_mlp_implementation: ${oc.select:actor_rollout_ref.actor.veomni.swiglu_mlp_implementation,eager}
rotary_pos_emb_implementation: ${oc.select:actor_rollout_ref.actor.veomni.rotary_pos_emb_implementation,eager}
rms_norm_gated_implementation: eager
causal_conv1d_implementation: eager
chunk_gated_delta_rule_implementation: eager
load_balancing_loss_implementation: ${oc.select:actor_rollout_ref.actor.veomni.load_balancing_loss_implementation,eager}
force_use_huggingface: false
activation_gpu_limit: 0.0
basic_modules: []
moe_load_balance_monitor_interval: 0
router_replay:
_target_: verl.workers.config.EngineRouterReplayConfig
Expand Down Expand Up @@ -492,9 +506,12 @@ critic:
init_device: meta
enable_full_shard: true
ckpt_manager: dcp
load_checkpoint_path: null
forward_prefetch: true
strategy: veomni
use_torch_compile: false
entropy_from_logits_with_chunking: false
entropy_checkpointing: false
forward_only: false
enable_fsdp_offload: false
enable_reentrant: false
Expand All @@ -504,9 +521,13 @@ critic:
rms_norm_implementation: eager
swiglu_mlp_implementation: eager
rotary_pos_emb_implementation: eager
rms_norm_gated_implementation: eager
causal_conv1d_implementation: eager
chunk_gated_delta_rule_implementation: eager
load_balancing_loss_implementation: eager
force_use_huggingface: false
activation_gpu_limit: 0.0
basic_modules: []
moe_load_balance_monitor_interval: 0
router_replay:
_target_: verl.workers.config.EngineRouterReplayConfig
Expand Down
15 changes: 15 additions & 0 deletions verl/trainer/config/engine/veomni.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,9 @@ enable_full_shard: true

ckpt_manager: dcp

# Path to load checkpoint from, if any
load_checkpoint_path: null

# FSDP1 only: prefetch the next forward-pass all-gather
# before the current forward computation.
forward_prefetch: true
Expand All @@ -37,6 +40,12 @@ strategy: veomni
# Whether to use torch compile in fsdp.
use_torch_compile: false

# Whether to use entropy_from_logits_with_chunking in fsdp.
entropy_from_logits_with_chunking: false

# Whether to use entropy checkpointing in fsdp.
entropy_checkpointing: false

# Whether to use forward only in fsdp.
forward_only: false

Expand All @@ -60,12 +69,18 @@ cross_entropy_loss_implementation: eager
rms_norm_implementation: eager
swiglu_mlp_implementation: eager
rotary_pos_emb_implementation: eager
rms_norm_gated_implementation: eager
causal_conv1d_implementation: eager
chunk_gated_delta_rule_implementation: eager
load_balancing_loss_implementation: eager

force_use_huggingface: false

activation_gpu_limit: 0.0

# List of basic modules to use
basic_modules: []

# MoE expert-load monitor interval. When > 0, attach VeOmni's MoERouterMonitor.
# Scalar metrics flow through Tracking; heatmap images go to wandb on rank 0.
moe_load_balance_monitor_interval: 0
Expand Down
26 changes: 19 additions & 7 deletions verl/workers/config/engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -272,11 +272,9 @@ class VeOmniEngineConfig(EngineConfig):
The inheritance from BaseConfig provides omegaconf.DictConfig-like interface for a dataclass config.

Args:
wrap_policy (Dict[str, Any]): Configuration for FSDP wrap policy.
param_offload (bool): Whether to offload parameters to CPU, default False
optimizer_offload (bool): Whether to offload optimizer states to CPU, default False
offload_policy (bool): Whether to offload policy model parameters, default False
reshard_after_forward (bool): Whether to reshard parameters after forward pass, default True
fsdp_size (int): FSDP group size. -1 means use all available GPUs, default -1
ulysses_parallel_size (int): Ulysses sequence parallel size, default 1
expert_parallel_size (int): Expert parallel size, default 1
Expand Down Expand Up @@ -324,22 +322,33 @@ class VeOmniEngineConfig(EngineConfig):
basic_modules (Optional[list[str]]): List of basic modules to use, default empty list
forward_prefetch (bool): Whether to prefetch parameters for next forward pass, default False
model_dtype (str): Model data type used to initialize the transformers model. default "fp32"
use_orig_params (bool): Whether to use original parameters when initialize FSDP1, default False
seed (int): Random seed for reproducibility.
full_determinism (bool): If true, enable_full_determinism is called to ensure reproducible results
in distributed training. Important: this will negatively impact performance, so only use it for
debugging.
mixed_precision (Optional[dict[str, Any]]): Mixed precision configuration for FSDP, default None
rms_norm_gated_implementation (str): Gated RMSNorm implementation (Qwen3.5 GatedDeltaNet
``self.norm``). ``"fla"`` uses fla.modules.FusedRMSNormGated (requires flash-linear-attention,
GPU). ``"eager"`` (default) uses the HuggingFace Qwen3_5RMSNormGated. Qwen3.5 has no NPU
backend today — selecting any non-eager value on NPU raises at OpSlot bind time.
causal_conv1d_implementation (str): Varlen depthwise causal conv1d implementation (Qwen3.5
GatedDeltaNet pre-mixer). ``"fla"`` uses fla.modules.convolution.causal_conv1d (requires
flash-linear-attention, GPU). ``"eager"`` (default) leaves causal_conv1d_fn unset; the varlen
training path then raises because no torch fallback handles cu_seqlens. Qwen3.5 has no NPU
backend today — selecting any non-eager value on NPU raises at OpSlot bind time.
chunk_gated_delta_rule_implementation (str): Chunk gated delta-rule kernel for Qwen3.5 linear
attention. ``"fla"`` uses fla.ops.gated_delta_rule.chunk_gated_delta_rule (requires
flash-linear-attention, GPU). ``"flash_qla"`` uses QwenLM FlashQLA (requires the optional
flash-qla extra, Hopper SM90 only — no Ampere/Ada below or Blackwell above; SM10x wheels are
WIP upstream). ``"eager"`` (default) uses transformers' torch_chunk_gated_delta_rule, which
does NOT support cu_seqlens; varlen training therefore raises at runtime. Qwen3.5 has no NPU
backend today — selecting any non-eager value on NPU raises at OpSlot bind time.

"""

_mutable_fields = EngineConfig._mutable_fields | {"attn_implementation"}

wrap_policy: dict[str, Any] = field(default_factory=dict)
offload_policy: bool = False
reshard_after_forward: bool = True
forward_prefetch: bool = False
use_orig_params: bool = False
entropy_from_logits_with_chunking: bool = False
use_torch_compile: bool = True
entropy_checkpointing: bool = False
Expand Down Expand Up @@ -367,6 +376,9 @@ class VeOmniEngineConfig(EngineConfig):
swiglu_mlp_implementation: str = "eager"
rotary_pos_emb_implementation: str = "eager"
load_balancing_loss_implementation: str = "eager"
rms_norm_gated_implementation: str = "eager"
causal_conv1d_implementation: str = "eager"
chunk_gated_delta_rule_implementation: str = "eager"
force_use_huggingface: bool = False
activation_gpu_limit: float = 0.0
basic_modules: Optional[list[str]] = field(default_factory=list)
Expand Down
3 changes: 3 additions & 0 deletions verl/workers/engine/veomni/transformer_impl.py
Original file line number Diff line number Diff line change
Expand Up @@ -277,6 +277,9 @@ def _build_model_optimizer(self):
swiglu_mlp_implementation=self.engine_config.swiglu_mlp_implementation,
rotary_pos_emb_implementation=self.engine_config.rotary_pos_emb_implementation,
load_balancing_loss_implementation=self.engine_config.load_balancing_loss_implementation,
rms_norm_gated_implementation=self.engine_config.rms_norm_gated_implementation,
causal_conv1d_implementation=self.engine_config.causal_conv1d_implementation,
chunk_gated_delta_rule_implementation=self.engine_config.chunk_gated_delta_rule_implementation,
)

# Load base model with specified configuration and dtype
Expand Down