diff --git a/README.md b/README.md
index 94704cc2..79c5214a 100644
--- a/README.md
+++ b/README.md
@@ -70,9 +70,10 @@ rest of the cells run as-is.
| Tutorial | Summary | Difficulty | Framework | Launch |
|---|---|---|---|---|
-| [`000_rl_basics`](tutorials/rl/000_rl_basics/000_rl_basics.ipynb) | Qwen3-4B haiku evaluation with verifiable rewards — serve, evaluate, train, compare | Beginner | `slime` |
|
-| [`001_sandboxes`](tutorials/rl/001_sandboxes/001_sandboxes.ipynb) | Code RL with Harbor hello-world and sandboxed verification | Intermediate | `slime` |
|
-| [`002_multiturn`](tutorials/rl/002_multiturn/002_multiturn.ipynb) | Multi-turn number-guessing RL with custom generate and reward functions | Intermediate | `slime` |
|
+| [`000_rl_basics`](tutorials/rl/000_rl_basics/000_rl_basics.ipynb) | Qwen3-4B haiku evaluation with verifiable rewards — serve, evaluate, train, compare | Beginner | `slime` |
|
+| [`001_sandboxes`](tutorials/rl/001_sandboxes/001_sandboxes.ipynb) | Code RL with Harbor hello-world and sandboxed verification | Intermediate | `slime` |
|
+| [`002_multiturn`](tutorials/rl/002_multiturn/002_multiturn.ipynb) | Multi-turn number-guessing RL with custom generate and reward functions | Intermediate | `slime` |
|
+| [`003_glm_gsm8k`](tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.ipynb) | GLM-4.7 (355B MoE) on GSM8K math — serve, evaluate, GRPO-train, compare | Advanced | `slime` |
|
See [`tutorials/README.md`](tutorials/README.md) for how to run the `.py`
diff --git a/modal_training_gym/__init__.py b/modal_training_gym/__init__.py
index a5e5ab6f..273084f8 100644
--- a/modal_training_gym/__init__.py
+++ b/modal_training_gym/__init__.py
@@ -10,6 +10,8 @@
"EvalConfigDurable": ("modal_training_gym.common.eval", "EvalConfigDurable"),
"EvalResult": ("modal_training_gym.common.eval", "EvalResult"),
"EvalRowResult": ("modal_training_gym.common.eval", "EvalRowResult"),
+ "GLM_4_7": ("modal_training_gym.common.models", "GLM_4_7"),
+ "GLM_4_7_Flash": ("modal_training_gym.common.models", "GLM_4_7_Flash"),
"HFModelConfiguration": (
"modal_training_gym.common.models",
"HFModelConfiguration",
@@ -60,6 +62,8 @@
"ModelConfig",
"ModelDeployment",
"MultiTurn",
+ "GLM_4_7",
+ "GLM_4_7_Flash",
"Qwen3_0_6B",
"Qwen3_1_7B",
"Qwen3_4B",
diff --git a/modal_training_gym/common/models/__init__.py b/modal_training_gym/common/models/__init__.py
index f2401e3a..8316d4ef 100644
--- a/modal_training_gym/common/models/__init__.py
+++ b/modal_training_gym/common/models/__init__.py
@@ -3,6 +3,8 @@
ModelArchitecture,
ModelConfig,
)
+from .glm_4_7 import GLM_4_7
+from .glm_4_7_flash import GLM_4_7_Flash
from .qwen3_0_6b import Qwen3_0_6B
from .qwen3_1_7b import Qwen3_1_7B
from .qwen3_4b import Qwen3_4B
@@ -12,6 +14,8 @@
from .qwen3_32b import Qwen3_32B
__all__ = [
+ "GLM_4_7",
+ "GLM_4_7_Flash",
"HFModelConfiguration",
"ModelArchitecture",
"ModelConfig",
diff --git a/modal_training_gym/common/models/base.py b/modal_training_gym/common/models/base.py
index f8123bcc..5cafad13 100644
--- a/modal_training_gym/common/models/base.py
+++ b/modal_training_gym/common/models/base.py
@@ -75,6 +75,13 @@ class ModelArchitecture:
use_rotary_position_embeddings: bool = True
rotary_base: int = 10000
+ # MoE (Mixture of Experts)
+ num_experts: int = 0
+ moe_router_topk: int = 0
+ moe_ffn_hidden_size: int = 0
+ num_shared_experts: int = 0
+ first_k_dense_replace: int = 0
+
def to_megatron_args(self) -> list[str]:
"""Generate Megatron-LM CLI flags from this architecture spec."""
args: list[str] = []
@@ -111,6 +118,16 @@ def to_megatron_args(self) -> list[str]:
args += ["--position-embedding-type", "rope"]
if self.rotary_base != 10000:
args += ["--rotary-base", str(self.rotary_base)]
+ if self.num_experts:
+ args += ["--num-experts", str(self.num_experts)]
+ if self.moe_router_topk:
+ args += ["--moe-router-topk", str(self.moe_router_topk)]
+ if self.moe_ffn_hidden_size:
+ args += ["--moe-ffn-hidden-size", str(self.moe_ffn_hidden_size)]
+ if self.num_shared_experts:
+ args += ["--num-shared-experts", str(self.num_shared_experts)]
+ if self.first_k_dense_replace:
+ args += ["--first-k-dense-replace", str(self.first_k_dense_replace)]
return args
diff --git a/modal_training_gym/common/models/glm_4_7.py b/modal_training_gym/common/models/glm_4_7.py
new file mode 100644
index 00000000..8e375a5e
--- /dev/null
+++ b/modal_training_gym/common/models/glm_4_7.py
@@ -0,0 +1,38 @@
+"""GLM-4.7 (355B-A32B MoE) model spec."""
+
+from .base import HFModelConfiguration, ModelArchitecture
+
+
+class GLM_4_7(HFModelConfiguration):
+ """GLM-4.7 (355B total, 32B active) MoE from Zhipu AI.
+
+ 160 routed experts with top-8 routing plus 1 shared expert.
+ First 3 layers are dense; remaining 89 are MoE.
+ Uses GQA (96 Q heads, 8 KV heads) with partial RoPE.
+ Downloads from ``zai-org/GLM-4.7`` on HuggingFace.
+ """
+
+ model_name = "zai-org/GLM-4.7"
+ architecture = ModelArchitecture(
+ num_layers=92,
+ hidden_size=5120,
+ ffn_hidden_size=12288,
+ num_attention_heads=96,
+ group_query_attention=True,
+ num_query_groups=8,
+ kv_channels=128,
+ vocab_size=151552,
+ normalization="RMSNorm",
+ norm_epsilon=1e-5,
+ swiglu=True,
+ disable_bias_linear=False,
+ qk_layernorm=True,
+ untie_embeddings_and_output_weights=True,
+ use_rotary_position_embeddings=True,
+ rotary_base=1000000,
+ num_experts=160,
+ moe_router_topk=8,
+ moe_ffn_hidden_size=1536,
+ num_shared_experts=1,
+ first_k_dense_replace=3,
+ )
diff --git a/modal_training_gym/common/models/glm_4_7_flash.py b/modal_training_gym/common/models/glm_4_7_flash.py
new file mode 100644
index 00000000..6bb84538
--- /dev/null
+++ b/modal_training_gym/common/models/glm_4_7_flash.py
@@ -0,0 +1,36 @@
+"""GLM-4.7-Flash (30B-A3B MoE) model spec."""
+
+from .base import HFModelConfiguration, ModelArchitecture
+
+
+class GLM_4_7_Flash(HFModelConfiguration):
+ """GLM-4.7-Flash (30B total, 3B active) MoE from Zhipu AI.
+
+ 64 routed experts with top-4 routing plus 1 shared expert.
+ Uses Multi-head Latent Attention (MLA) and multi-token prediction.
+ Downloads from ``zai-org/GLM-4.7-Flash`` on HuggingFace.
+ """
+
+ model_name = "zai-org/GLM-4.7-Flash"
+ architecture = ModelArchitecture(
+ num_layers=47,
+ hidden_size=2048,
+ ffn_hidden_size=10240,
+ num_attention_heads=20,
+ group_query_attention=False,
+ num_query_groups=20,
+ kv_channels=128,
+ vocab_size=154880,
+ normalization="RMSNorm",
+ norm_epsilon=1e-5,
+ swiglu=True,
+ disable_bias_linear=True,
+ qk_layernorm=True,
+ use_rotary_position_embeddings=True,
+ rotary_base=1000000,
+ num_experts=64,
+ moe_router_topk=4,
+ moe_ffn_hidden_size=1536,
+ num_shared_experts=1,
+ first_k_dense_replace=1,
+ )
diff --git a/modal_training_gym/deploy_recipes/sglang_recipe/__init__.py b/modal_training_gym/deploy_recipes/sglang_recipe/__init__.py
index 173da428..faae4f75 100644
--- a/modal_training_gym/deploy_recipes/sglang_recipe/__init__.py
+++ b/modal_training_gym/deploy_recipes/sglang_recipe/__init__.py
@@ -1,4 +1,10 @@
from modal_training_gym.deploy_recipes.sglang_recipe.recipe import SglangRecipe
+from modal_training_gym.deploy_recipes.sglang_recipe.glm_4_7 import (
+ GLM_4_7_SglangRecipe,
+)
+from modal_training_gym.deploy_recipes.sglang_recipe.glm_4_7_flash import (
+ GLM_4_7_Flash_SglangRecipe,
+)
from modal_training_gym.deploy_recipes.sglang_recipe.qwen3_0_6b import (
Qwen3_0_6b_SglangRecipe,
)
@@ -22,6 +28,8 @@
)
__all__ = [
+ "GLM_4_7_SglangRecipe",
+ "GLM_4_7_Flash_SglangRecipe",
"SglangRecipe",
"Qwen3_0_6b_SglangRecipe",
"Qwen3_1_7b_SglangRecipe",
diff --git a/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7.py b/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7.py
new file mode 100644
index 00000000..84383006
--- /dev/null
+++ b/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7.py
@@ -0,0 +1,26 @@
+from dataclasses import dataclass
+
+from modal_training_gym.deploy_recipes.sglang_recipe.recipe import SglangRecipe
+
+_GLM_4_7_DEFAULTS = {
+ "gpu": "H100",
+ "tp": 8,
+ "context_length": 32768,
+ "mem_fraction_static": 0.80,
+ "chunked_prefill_size": 8192,
+ "max_running_requests": 16,
+ "extra_server_args": {"--trust-remote-code": ""},
+}
+
+
+_SGLANG_DEFAULTS = SglangRecipe()
+
+
+@dataclass
+class GLM_4_7_SglangRecipe(SglangRecipe):
+ """GLM-4.7 (355B) on 8×H100 — tensor-parallel MoE serving."""
+
+ def __post_init__(self) -> None:
+ for key, val in _GLM_4_7_DEFAULTS.items():
+ if getattr(self, key) == getattr(_SGLANG_DEFAULTS, key):
+ object.__setattr__(self, key, val)
diff --git a/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7_flash.py b/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7_flash.py
new file mode 100644
index 00000000..d248b63a
--- /dev/null
+++ b/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7_flash.py
@@ -0,0 +1,27 @@
+from dataclasses import dataclass
+
+from modal_training_gym.deploy_recipes.sglang_recipe.recipe import SglangRecipe
+
+_GLM_4_7_FLASH_DEFAULTS = {
+ "gpu": "H100",
+ "tp": 1,
+ "dp": 8,
+ "context_length": 32768,
+ "mem_fraction_static": 0.80,
+ "chunked_prefill_size": 8192,
+ "max_running_requests": 16,
+ "extra_server_args": {"--trust-remote-code": ""},
+}
+
+
+_SGLANG_DEFAULTS = SglangRecipe()
+
+
+@dataclass
+class GLM_4_7_Flash_SglangRecipe(SglangRecipe):
+ """GLM-4.7-Flash on 8×H100 — DP-attention MoE serving."""
+
+ def __post_init__(self) -> None:
+ for key, val in _GLM_4_7_FLASH_DEFAULTS.items():
+ if getattr(self, key) == getattr(_SGLANG_DEFAULTS, key):
+ object.__setattr__(self, key, val)
diff --git a/modal_training_gym/deploy_recipes/vllm_recipe/__init__.py b/modal_training_gym/deploy_recipes/vllm_recipe/__init__.py
index cdcb8a99..34091d67 100644
--- a/modal_training_gym/deploy_recipes/vllm_recipe/__init__.py
+++ b/modal_training_gym/deploy_recipes/vllm_recipe/__init__.py
@@ -1,4 +1,10 @@
from modal_training_gym.deploy_recipes.vllm_recipe.recipe import VllmRecipe
+from modal_training_gym.deploy_recipes.vllm_recipe.glm_4_7 import (
+ GLM_4_7_VllmRecipe,
+)
+from modal_training_gym.deploy_recipes.vllm_recipe.glm_4_7_flash import (
+ GLM_4_7_Flash_VllmRecipe,
+)
from modal_training_gym.deploy_recipes.vllm_recipe.qwen3_0_6b import (
Qwen3_0_6b_VllmRecipe,
)
@@ -12,6 +18,8 @@
from modal_training_gym.deploy_recipes.vllm_recipe.qwen3_32b import Qwen3_32b_VllmRecipe
__all__ = [
+ "GLM_4_7_VllmRecipe",
+ "GLM_4_7_Flash_VllmRecipe",
"VllmRecipe",
"Qwen3_0_6b_VllmRecipe",
"Qwen3_1_7b_VllmRecipe",
diff --git a/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7.py b/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7.py
new file mode 100644
index 00000000..09010cf5
--- /dev/null
+++ b/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7.py
@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+
+from modal_training_gym.deploy_recipes.vllm_recipe.recipe import VllmRecipe
+
+_GLM_4_7_DEFAULTS = {
+ "gpu": "H100",
+ "n_gpu": 8,
+ "extra_vllm_args": ["--trust-remote-code"],
+}
+
+_VLLM_DEFAULTS = VllmRecipe()
+
+
+@dataclass
+class GLM_4_7_VllmRecipe(VllmRecipe):
+ """GLM-4.7 (355B) on 8×H100 — tensor-parallel MoE serving."""
+
+ def __post_init__(self) -> None:
+ for key, val in _GLM_4_7_DEFAULTS.items():
+ if getattr(self, key) == getattr(_VLLM_DEFAULTS, key):
+ object.__setattr__(self, key, val)
diff --git a/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7_flash.py b/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7_flash.py
new file mode 100644
index 00000000..50cc52c4
--- /dev/null
+++ b/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7_flash.py
@@ -0,0 +1,21 @@
+from dataclasses import dataclass
+
+from modal_training_gym.deploy_recipes.vllm_recipe.recipe import VllmRecipe
+
+_GLM_4_7_FLASH_DEFAULTS = {
+ "gpu": "H100",
+ "n_gpu": 2,
+ "extra_vllm_args": ["--trust-remote-code"],
+}
+
+_VLLM_DEFAULTS = VllmRecipe()
+
+
+@dataclass
+class GLM_4_7_Flash_VllmRecipe(VllmRecipe):
+ """GLM-4.7-Flash on 2×H100 — tensor-parallel MoE serving."""
+
+ def __post_init__(self) -> None:
+ for key, val in _GLM_4_7_FLASH_DEFAULTS.items():
+ if getattr(self, key) == getattr(_VLLM_DEFAULTS, key):
+ object.__setattr__(self, key, val)
diff --git a/modal_training_gym/train_recipes/slime_recipe/__init__.py b/modal_training_gym/train_recipes/slime_recipe/__init__.py
index 2c8e5635..7487668b 100644
--- a/modal_training_gym/train_recipes/slime_recipe/__init__.py
+++ b/modal_training_gym/train_recipes/slime_recipe/__init__.py
@@ -3,6 +3,10 @@
SlimeRecipeBlock,
)
from modal_training_gym.train_recipes.slime_recipe.recipe import SlimeRecipe
+from modal_training_gym.train_recipes.slime_recipe.glm_4_7 import GLM_4_7_Recipe
+from modal_training_gym.train_recipes.slime_recipe.glm_4_7_flash import (
+ GLM_4_7_Flash_Recipe,
+)
from modal_training_gym.train_recipes.slime_recipe.qwen3_1_7b import Qwen3_1_7b_Recipe
from modal_training_gym.train_recipes.slime_recipe.qwen3_8b import Qwen3_8b_Recipe
from modal_training_gym.train_recipes.slime_recipe.qwen3_14b import Qwen3_14b_Recipe
@@ -10,6 +14,8 @@
from modal_training_gym.train_recipes.slime_recipe.qwen3_4b import Qwen3_4b_Recipe
__all__ = [
+ "GLM_4_7_Recipe",
+ "GLM_4_7_Flash_Recipe",
"MultiTurn",
"SlimeRecipe",
"SlimeRecipeBlock",
diff --git a/modal_training_gym/train_recipes/slime_recipe/glm_4_7.py b/modal_training_gym/train_recipes/slime_recipe/glm_4_7.py
new file mode 100644
index 00000000..2de0060a
--- /dev/null
+++ b/modal_training_gym/train_recipes/slime_recipe/glm_4_7.py
@@ -0,0 +1,51 @@
+from pydantic import ConfigDict
+from pydantic.dataclasses import dataclass
+
+from modal_training_gym.train_recipes.slime_recipe.recipe import SlimeRecipe
+
+
+@dataclass(config=ConfigDict(extra="forbid", arbitrary_types_allowed=True))
+class GLM_4_7_Recipe(SlimeRecipe):
+ """GLM-4.7 (355B-A32B MoE) on 8×8×H100, colocated GRPO.
+
+ TP=8, PP=4, CP=2, EP=16 across 8 nodes (64 GPUs).
+ Uses CPU optimizer offloading for the large parameter count.
+ """
+
+ gpu_type: str = "H100"
+ colocate: bool = True
+ actor_num_nodes: int = 8
+ actor_num_gpus_per_node: int = 8
+ tensor_model_parallel_size: int = 8
+ sequence_parallel: bool = True
+ rollout_num_gpus_per_engine: int = 32
+
+ # MoE parallelism
+ expert_model_parallel_size: int = 16
+ expert_tensor_parallel_size: int = 1
+ pipeline_model_parallel_size: int = 4
+ context_parallel_size: int = 2
+ attention_backend: str | None = "flash"
+
+ # Rollout
+ num_rollout: int = 1
+ rollout_batch_size: int = 64
+ rollout_max_response_len: int = 4096
+ rollout_temperature: float = 1.0
+ sglang_mem_fraction_static: float = 0.70
+
+ save_interval: int = 10
+
+ # Training
+ n_samples_per_prompt: int = 8
+ global_batch_size: int = 512
+ lr: float = 1e-6
+ max_tokens_per_gpu: int = 16384
+
+ # Optimizer offloading (required for 355B model)
+ optimizer_cpu_offload: bool = True
+ overlap_cpu_optimizer_d2h_h2d: bool = True
+ use_precision_aware_optimizer: bool = True
+
+ eval_interval: int | None = 10
+ eval_max_response_len: int = 4096
diff --git a/modal_training_gym/train_recipes/slime_recipe/glm_4_7_flash.py b/modal_training_gym/train_recipes/slime_recipe/glm_4_7_flash.py
new file mode 100644
index 00000000..90ee6745
--- /dev/null
+++ b/modal_training_gym/train_recipes/slime_recipe/glm_4_7_flash.py
@@ -0,0 +1,56 @@
+from pydantic import ConfigDict
+from pydantic.dataclasses import dataclass
+
+from modal_training_gym.train_recipes.slime_recipe.recipe import SlimeRecipe
+
+
+@dataclass(config=ConfigDict(extra="forbid", arbitrary_types_allowed=True))
+class GLM_4_7_Flash_Recipe(SlimeRecipe):
+ """GLM-4.7-Flash (30B-A3B MoE) on 1×8×H100, colocated GRPO.
+
+ TP=1, PP=1, EP=8 fits on a single 8-GPU node.
+ Uses MTP training, DeepEP, and CPU optimizer offloading.
+ """
+
+ gpu_type: str = "H100"
+ colocate: bool = True
+ tensor_model_parallel_size: int = 1
+ sequence_parallel: bool = False
+ rollout_num_gpus_per_engine: int = 8
+
+ # MoE parallelism
+ expert_model_parallel_size: int = 8
+ expert_tensor_parallel_size: int = 1
+ pipeline_model_parallel_size: int = 1
+ context_parallel_size: int = 1
+ moe_token_dispatcher_type: str | None = "flex"
+ moe_enable_deepep: bool = True
+
+ # MTP
+ enable_mtp_training: bool = True
+ mtp_num_layers: int = 1
+ mtp_loss_scaling_factor: float = 0.2
+
+ # Rollout
+ num_rollout: int = 1
+ rollout_batch_size: int = 64
+ rollout_max_response_len: int = 4096
+ rollout_temperature: float = 1.0
+ sglang_mem_fraction_static: float = 0.70
+
+ save_interval: int = 10
+
+ # Training
+ n_samples_per_prompt: int = 8
+ global_batch_size: int = 512
+ lr: float = 1e-6
+ max_tokens_per_gpu: int = 32768
+ attention_backend: str | None = "flash"
+
+ # Optimizer offloading (MoE models are memory-heavy)
+ optimizer_cpu_offload: bool = True
+ overlap_cpu_optimizer_d2h_h2d: bool = True
+ use_precision_aware_optimizer: bool = True
+
+ eval_interval: int | None = 10
+ eval_max_response_len: int = 4096
diff --git a/modal_training_gym/train_recipes/slime_recipe/recipe.py b/modal_training_gym/train_recipes/slime_recipe/recipe.py
index 46ad418f..a68bc8a7 100644
--- a/modal_training_gym/train_recipes/slime_recipe/recipe.py
+++ b/modal_training_gym/train_recipes/slime_recipe/recipe.py
@@ -115,6 +115,25 @@ class SlimeRecipe(BaseTrainRecipe):
adam_beta2: float = 0.98
optimizer: str = "adam"
+    # ── Parallelism (expert / pipeline / context) and MoE dispatch ───────────
+ expert_model_parallel_size: int = 1
+ expert_tensor_parallel_size: int = 1
+ pipeline_model_parallel_size: int = 1
+ context_parallel_size: int = 1
+ moe_token_dispatcher_type: str | None = None
+ moe_enable_deepep: bool = False
+
+ # ── Multi-token prediction (MTP) ─────────────────────────────────────
+ enable_mtp_training: bool = False
+ mtp_num_layers: int = 0
+ mtp_loss_scaling_factor: float = 0.0
+
+    # ── Optimizer offloading and attention backend ───────────────────────
+ optimizer_cpu_offload: bool = False
+ overlap_cpu_optimizer_d2h_h2d: bool = False
+ use_precision_aware_optimizer: bool = False
+ attention_backend: str | None = None
+
# ── Memory and precision ────────────────────────────────────────────────
attention_dropout: float = 0.0
hidden_dropout: float = 0.0
@@ -252,7 +271,7 @@ def _validate_dataset(ds: "DatasetConfig") -> None:
@staticmethod
def _model_to_fields(m: "ModelConfig") -> dict[str, Any]:
arch = SlimeRecipe._validate_custom_model_architecture(m)
- return {
+ fields = {
"hf_checkpoint": m.model_path or m.model_name,
"num_layers": arch.num_layers,
"hidden_size": arch.hidden_size,
@@ -271,6 +290,17 @@ def _model_to_fields(m: "ModelConfig") -> dict[str, Any]:
"use_rotary_position_embeddings": arch.use_rotary_position_embeddings,
"rotary_base": arch.rotary_base,
}
+ if arch.num_experts:
+ fields["num_experts"] = arch.num_experts
+ if arch.moe_router_topk:
+ fields["moe_router_topk"] = arch.moe_router_topk
+ if arch.moe_ffn_hidden_size:
+ fields["moe_ffn_hidden_size"] = arch.moe_ffn_hidden_size
+ if arch.num_shared_experts:
+ fields["num_shared_experts"] = arch.num_shared_experts
+ if arch.first_k_dense_replace:
+ fields["first_k_dense_replace"] = arch.first_k_dense_replace
+ return fields
@staticmethod
def _wandb_to_fields(w: "WandbConfig") -> dict[str, Any]:
@@ -388,4 +418,16 @@ def get_base_recipe(cls, model_config: ModelConfig) -> "SlimeRecipe | None":
return Qwen3_14b_Recipe()
if model_config.model_name == "Qwen/Qwen3-32B":
return Qwen3_32b_Recipe()
+ if model_config.model_name == "zai-org/GLM-4.7":
+ from modal_training_gym.train_recipes.slime_recipe.glm_4_7 import (
+ GLM_4_7_Recipe,
+ )
+
+ return GLM_4_7_Recipe()
+ if model_config.model_name == "zai-org/GLM-4.7-Flash":
+ from modal_training_gym.train_recipes.slime_recipe.glm_4_7_flash import (
+ GLM_4_7_Flash_Recipe,
+ )
+
+ return GLM_4_7_Flash_Recipe()
return None
diff --git a/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.ipynb b/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.ipynb
new file mode 100644
index 00000000..e9c79f21
--- /dev/null
+++ b/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.ipynb
@@ -0,0 +1,434 @@
+{
+ "cells": [
+ {
+ "cell_type": "markdown",
+ "id": "cell-000",
+ "metadata": {},
+ "source": [
+ "# Training GLM-4.7 on GSM8K\n",
+ "\n",
+ "This tutorial trains [GLM-4.7](https://huggingface.co/zai-org/GLM-4.7),\n",
+ "a 355B-parameter Mixture-of-Experts model (32B active per token),\n",
+ "on grade-school math problems from [GSM8K](https://huggingface.co/datasets/openai/gsm8k).\n",
+ "\n",
+ "GLM-4.7 is a large MoE model with 160 routed experts and top-8 routing.\n",
+ "The first 3 layers are dense; the remaining 89 use sparse expert routing.\n",
+    "Training a model this size requires multi-node parallelism. This tutorial covers:\n",
+ "\n",
+ "1. Serving the base model with SGLang on 8\u00d7H100.\n",
+ "2. Loading the GSM8K dataset and defining a math-answer scorer.\n",
+ "3. Evaluating the base model.\n",
+ "4. GRPO-training with [slime](https://github.com/THUDM/slime) using the built-in math reward.\n",
+ "5. Evaluating the trained checkpoint and comparing.\n",
+ "\n",
+ "**Cluster shape:** 8 nodes \u00d7 8 H100 GPUs (64 GPUs total). Training uses\n",
+ "TP=8, PP=4, CP=2, EP=16 to shard the model across all GPUs."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-001",
+ "metadata": {},
+ "source": [
+ "## Prerequisites\n",
+ "\n",
+ "This tutorial requires a Modal Secret named `huggingface-secret` containing your\n",
+ "`HF_TOKEN`. Create one at [modal.com/secrets](https://modal.com/secrets) if you\n",
+ "haven't already \u2014 the cell below fails fast with instructions otherwise."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-002",
+ "metadata": {},
+ "source": [
+ "> **Note:** you do **not** need to attach a GPU to this notebook. All training and\n",
+ "> serving happens on Modal-managed GPU workers spun up by the SDK \u2014 the notebook\n",
+ "> itself only needs to issue API calls."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-003",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import modal\n",
+ "\n",
+ "try:\n",
+ " modal.Secret.from_name(\"huggingface-secret\").hydrate()\n",
+ "except modal.exception.NotFoundError as e:\n",
+ " raise RuntimeError(\n",
+ " \"Missing Modal Secret 'huggingface-secret'. Create one at \"\n",
+ " \"https://modal.com/secrets with an HF_TOKEN entry, then re-run.\"\n",
+ " ) from e"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-004",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "%uv pip install -q git+https://github.com/modal-projects/training-gym.git@main"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-005",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import re\n",
+ "\n",
+ "from modal_training_gym import (\n",
+ " DeploymentConfig,\n",
+ " EvalConfig,\n",
+ " EvalRowResult,\n",
+ " GLM_4_7,\n",
+ " HuggingFaceDataset,\n",
+ " SlimeRecipe,\n",
+ " TrainConfig,\n",
+ " list_checkpoints,\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-006",
+ "metadata": {},
+ "source": [
+ "## Serve the base model\n",
+ "\n",
+ "First, let's deploy GLM-4.7 so we can test it.\n",
+ "The 355B model needs 8\u00d7H100 GPUs for serving with tensor parallelism."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-007",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "base_model = GLM_4_7()\n",
+ "base_model_deployment = DeploymentConfig(\n",
+ " model=base_model,\n",
+ ").serve()\n",
+ "print(f\"Base model deployed to {base_model_deployment.url}\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-008",
+ "metadata": {},
+ "source": [
+ "Let's try asking it a math question to see how it responds."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-009",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "response = base_model_deployment.generate(\n",
+ " \"What is 24 * 37?\",\n",
+ " chat_template_kwargs={\"enable_thinking\": False},\n",
+ ")\n",
+ "print(response)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-010",
+ "metadata": {},
+ "source": [
+ "## Define the GSM8K dataset\n",
+ "\n",
+ "GSM8K contains grade-school math word problems. Each row has a `question`\n",
+    "and an `answer` that ends with `#### ` followed by the final numerical answer.\n",
+ "\n",
+ "We'll use the answer extraction pattern to score the model: extract the\n",
+ "number after `####` from the label and compare it to the model's final number."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-011",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "class GSM8KDataset(HuggingFaceDataset):\n",
+ " hf_repo = \"openai/gsm8k\"\n",
+ " hf_config = \"main\"\n",
+ " input_column = \"question\"\n",
+ " output_column = \"answer\"\n",
+ " output_format = \"jsonl\"\n",
+ " apply_chat_template = True\n",
+ " system_prompt = (\n",
+ " \"You are a math problem solver. Solve the given math problem \"\n",
+ " \"step by step, then give your final answer as a number on the \"\n",
+ " \"last line prefixed with ####. For example: #### 42\"\n",
+ " )\n",
+ " always_prepare = True\n",
+ "\n",
+ "train_dataset = GSM8KDataset(n_rows=50)\n",
+ "eval_dataset = GSM8KDataset(n_rows=20, hf_split=\"test\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-012",
+ "metadata": {},
+ "source": [
+ "Let's look at a few examples from the dataset."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-013",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "df = eval_dataset.to_pandas()\n",
+ "print(f\"Eval set: {len(df)} rows\")\n",
+ "df.head(3)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-014",
+ "metadata": {},
+ "source": [
+ "## Define the scoring function\n",
+ "\n",
+    "GSM8K answers end with `#### ` followed by the final number. We extract the number from both\n",
+ "the label and the model's response, then check if they match."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-015",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def extract_answer(text: str) -> str | None:\n",
+ " match = re.search(r\"####\\s*(-?[\\d,]+(?:\\.\\d+)?)\", text)\n",
+ " if match:\n",
+ " return match.group(1).replace(\",\", \"\").strip()\n",
+ " numbers = re.findall(r\"-?[\\d,]+(?:\\.\\d+)?\", text)\n",
+ " if numbers:\n",
+ " return numbers[-1].replace(\",\", \"\").strip()\n",
+ " return None\n",
+ "\n",
+ "def gsm8k_score(example: dict, response: str) -> EvalRowResult:\n",
+ " expected = extract_answer(example.get(\"answer\", \"\"))\n",
+ " predicted = extract_answer(response)\n",
+ " correct = expected is not None and predicted is not None and expected == predicted\n",
+ " return EvalRowResult(\n",
+ " score=1.0 if correct else 0.0,\n",
+ " response=response,\n",
+ " )"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-016",
+ "metadata": {},
+ "source": [
+ "## Evaluate the base model"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-017",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "eval_config = EvalConfig(\n",
+ " dataset=eval_dataset,\n",
+ " eval_response_fn=gsm8k_score,\n",
+ " generate_kwargs={\"chat_template_kwargs\": {\"enable_thinking\": False}},\n",
+ ")\n",
+ "print(\"--- Running base model evaluation... ---\")\n",
+ "base_eval = eval_config.evaluate(base_model_deployment, debug=True)\n",
+ "print(f\"Base model GSM8K accuracy: {base_eval.mean:.1%}\")\n",
+ "print(\"--- Base model evaluation complete ---\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-018",
+ "metadata": {},
+ "source": [
+ "## Train with slime\n",
+ "\n",
+ "Now let's GRPO-train GLM-4.7 on GSM8K using the built-in `math`\n",
+ "reward type. This reward extracts the final number from the model's\n",
+ "response and checks it against the label \u2014 the same logic as our eval\n",
+ "scoring function.\n",
+ "\n",
+ "Because GLM-4.7 is a 355B MoE model, the recipe uses heavy parallelism:\n",
+ "- **TP=8:** tensor parallelism across all GPUs in a node.\n",
+ "- **PP=4:** pipeline parallelism across 4 stages.\n",
+ "- **EP=16:** expert parallelism shards the 160 experts across 16 groups.\n",
+ "- **CP=2:** context parallelism for long sequences.\n",
+ "- **CPU optimizer offload:** frees GPU memory for the large parameter count."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-019",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "training_run = TrainConfig(\n",
+ " model=base_model,\n",
+ " dataset=train_dataset,\n",
+ " recipe=SlimeRecipe(\n",
+ " rm_type=\"math\",\n",
+ "\n",
+ " gpu_type=\"H100\",\n",
+ " colocate=True,\n",
+ " actor_num_nodes=8,\n",
+ " actor_num_gpus_per_node=8,\n",
+ " tensor_model_parallel_size=8,\n",
+ " sequence_parallel=True,\n",
+ " rollout_num_gpus_per_engine=32,\n",
+ "\n",
+ " expert_model_parallel_size=16,\n",
+ " expert_tensor_parallel_size=1,\n",
+ " pipeline_model_parallel_size=4,\n",
+ " context_parallel_size=2,\n",
+ " attention_backend=\"flash\",\n",
+ "\n",
+ " num_rollout=10,\n",
+ " rollout_batch_size=64,\n",
+ " rollout_max_response_len=4096,\n",
+ " rollout_temperature=1.0,\n",
+ " sglang_mem_fraction_static=0.70,\n",
+ "\n",
+ " n_samples_per_prompt=8,\n",
+ " global_batch_size=512,\n",
+ " max_tokens_per_gpu=16384,\n",
+ "\n",
+ " optimizer_cpu_offload=True,\n",
+ " overlap_cpu_optimizer_d2h_h2d=True,\n",
+ " use_precision_aware_optimizer=True,\n",
+ "\n",
+ " save_interval=5,\n",
+ " apply_chat_template_kwargs='{\"enable_thinking\": false}',\n",
+ " ),\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-020",
+ "metadata": {},
+ "source": [
+ "## Launch training\n",
+ "\n",
+ "`TrainConfig.train()` builds the Modal app, launches the 64-GPU cluster\n",
+ "(8 nodes \u00d7 8 H100), runs GRPO training, and returns a `TrainResult`."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-021",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"--- Running training... ---\")\n",
+ "train_result = training_run.train()\n",
+ "print(\"--- Training complete ---\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-022",
+ "metadata": {},
+ "source": [
+ "## Serve and evaluate the trained checkpoint\n",
+ "\n",
+ "Let's deploy the trained checkpoint and run the same GSM8K eval."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-023",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "checkpoint = list_checkpoints(train_result.training_run_id)[-1]\n",
+ "print(f\"Checkpoint: {checkpoint.path}\")\n",
+ "\n",
+ "trained_model_deployment = DeploymentConfig(\n",
+ " model=GLM_4_7(),\n",
+ " checkpoint=checkpoint,\n",
+ " app_name=\"glm-4.7-gsm8k-serve\",\n",
+ " served_model_name=\"glm-4.7-gsm8k\",\n",
+ ").serve()\n",
+ "print(f\"Trained model deployed to {trained_model_deployment.url}\")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-024",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(\"--- Running trained model evaluation... ---\")\n",
+ "trained_eval = eval_config.evaluate(trained_model_deployment, debug=True)\n",
+ "print(f\"Trained model GSM8K accuracy: {trained_eval.mean:.1%}\")\n",
+ "print(\"--- Trained model evaluation complete ---\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "cell-025",
+ "metadata": {},
+ "source": [
+ "## Compare results"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "cell-026",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "print(f\"Base model GSM8K accuracy: {base_eval.mean:.1%}\")\n",
+ "print(f\"Trained model GSM8K accuracy: {trained_eval.mean:.1%}\")\n",
+ "improvement = trained_eval.mean - base_eval.mean\n",
+ "print(f\"Improvement: {improvement:+.1%}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "name": "python"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.py b/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.py
new file mode 100644
index 00000000..55bf855b
--- /dev/null
+++ b/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.py
@@ -0,0 +1,223 @@
+# Generated by generate_tutorial.py — do not edit directly.
+# Source: tutorials/tutorial_generator/rl/003_glm_gsm8k.py
+
+# # Training GLM-4.7 on GSM8K
+#
+# This tutorial trains [GLM-4.7](https://huggingface.co/zai-org/GLM-4.7),
+# a 355B-parameter Mixture-of-Experts model (32B active per token),
+# on grade-school math problems from [GSM8K](https://huggingface.co/datasets/openai/gsm8k).
+#
+# GLM-4.7 is a large MoE model with 160 routed experts and top-8 routing.
+# The first 3 layers are dense; the remaining 89 use sparse expert routing.
+# Training a model this size requires multi-node parallelism. The tutorial covers:
+#
+# 1. Serving the base model with SGLang on 8×H100.
+# 2. Loading the GSM8K dataset and defining a math-answer scorer.
+# 3. Evaluating the base model.
+# 4. GRPO-training with [slime](https://github.com/THUDM/slime) using the built-in math reward.
+# 5. Evaluating the trained checkpoint and comparing.
+#
+# **Cluster shape:** 8 nodes × 8 H100 GPUs (64 GPUs total). Training uses
+# TP=8, PP=4, CP=2, EP=16 to shard the model across all GPUs.
+# To run the tutorial, run the following command:
+# ```
+# uv run python tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.py
+# ```
+# ## Prerequisites
+#
+# This tutorial requires a Modal Secret named `huggingface-secret` containing your
+# `HF_TOKEN`. Create one at [modal.com/secrets](https://modal.com/secrets) if you
+# haven't already — the cell below fails fast with instructions otherwise.
+
+import modal
+
+import re
+
+from modal_training_gym import (
+ DeploymentConfig,
+ EvalConfig,
+ EvalRowResult,
+ GLM_4_7,
+ HuggingFaceDataset,
+ SlimeRecipe,
+ TrainConfig,
+ list_checkpoints,
+)
+
+# ## Define the GSM8K dataset
+#
+# GSM8K contains grade-school math word problems. Each row has a `question`
+# and an `answer` that ends with `#### <number>` — the final numerical answer.
+#
+# We'll use the answer extraction pattern to score the model: extract the
+# number after `####` from the label and compare it to the model's final number.
+
+class GSM8KDataset(HuggingFaceDataset):  # GSM8K exposed as question → answer rows
+ hf_repo = "openai/gsm8k"
+ hf_config = "main"
+ input_column = "question"  # the word problem
+ output_column = "answer"  # gsm8k_score reads example["answer"] — presumably this column; confirm
+ output_format = "jsonl"
+ apply_chat_template = True
+ system_prompt = (  # requests the "#### <n>" form that extract_answer searches for first
+ "You are a math problem solver. Solve the given math problem "
+ "step by step, then give your final answer as a number on the "
+ "last line prefixed with ####. For example: #### 42"
+ )
+ always_prepare = True
+
+# ## Define the scoring function
+#
+# GSM8K answers end with `#### <number>`. We extract the number from both
+# the label and the model's response, then check if they match.
+
+def extract_answer(text: str) -> str | None:
+ match = re.search(r"####\s*(-?[\d,]+(?:\.\d+)?)", text)
+ if match:
+ return match.group(1).replace(",", "").strip()
+ numbers = re.findall(r"-?[\d,]+(?:\.\d+)?", text)
+ if numbers:
+ return numbers[-1].replace(",", "").strip()
+ return None
+
+def gsm8k_score(example: dict, response: str) -> EvalRowResult:
+ expected = extract_answer(example.get("answer", ""))
+ predicted = extract_answer(response)
+ correct = expected is not None and predicted is not None and expected == predicted
+ return EvalRowResult(
+ score=1.0 if correct else 0.0,
+ response=response,
+ )
+
+import modal
+
+tutorial_cli_app = modal.App()
+
+def _main_impl() -> None:  # whole tutorial flow: secret check → serve → eval → train → serve → eval → compare
+ try:
+ modal.Secret.from_name("huggingface-secret").hydrate()  # checked first so a missing secret stops the run before any deployment
+ except modal.exception.NotFoundError as e:
+ raise RuntimeError(
+ "Missing Modal Secret 'huggingface-secret'. Create one at "
+ "https://modal.com/secrets with an HF_TOKEN entry, then re-run."
+ ) from e
+
+ # ## Serve the base model
+ #
+ # First, let's deploy GLM-4.7 so we can test it.
+ # The 355B model needs 8×H100 GPUs for serving with tensor parallelism.
+
+ base_model = GLM_4_7()
+ base_model_deployment = DeploymentConfig(
+ model=base_model,
+ ).serve()
+ print(f"Base model deployed to {base_model_deployment.url}")
+
+ train_dataset = GSM8KDataset(n_rows=50)  # no hf_split given — NOTE(review): presumably defaults to the train split; confirm
+ eval_dataset = GSM8KDataset(n_rows=20, hf_split="test")  # evaluation reads the "test" split
+
+ eval_config = EvalConfig(  # reused unchanged for the trained-checkpoint eval below
+ dataset=eval_dataset,
+ eval_response_fn=gsm8k_score,
+ generate_kwargs={"chat_template_kwargs": {"enable_thinking": False}},  # same setting the recipe passes via apply_chat_template_kwargs
+ )
+ print("--- Running base model evaluation... ---")
+ base_eval = eval_config.evaluate(base_model_deployment, debug=True)  # kept for the final comparison
+ print(f"Base model GSM8K accuracy: {base_eval.mean:.1%}")
+ print("--- Base model evaluation complete ---")
+
+ # ## Train with slime
+ #
+ # Now let's GRPO-train GLM-4.7 on GSM8K using the built-in `math`
+ # reward type. This reward extracts the final number from the model's
+ # response and checks it against the label — the same logic as our eval
+ # scoring function.
+ #
+ # Because GLM-4.7 is a 355B MoE model, the recipe uses heavy parallelism:
+ # - **TP=8:** tensor parallelism across all GPUs in a node.
+ # - **PP=4:** pipeline parallelism across 4 stages.
+ # - **EP=16:** expert parallelism shards the 160 experts across 16 groups.
+ # - **CP=2:** context parallelism for long sequences.
+ # - **CPU optimizer offload:** frees GPU memory for the large parameter count.
+
+ training_run = TrainConfig(
+ model=base_model,
+ dataset=train_dataset,
+ recipe=SlimeRecipe(
+ rm_type="math",
+
+ gpu_type="H100",
+ colocate=True,
+ actor_num_nodes=8,  # 8 nodes × 8 GPUs per node = 64 GPUs
+ actor_num_gpus_per_node=8,
+ tensor_model_parallel_size=8,  # TP 8 × PP 4 × CP 2 = 64 = total GPU count
+ sequence_parallel=True,
+ rollout_num_gpus_per_engine=32,
+
+ expert_model_parallel_size=16,
+ expert_tensor_parallel_size=1,
+ pipeline_model_parallel_size=4,
+ context_parallel_size=2,
+ attention_backend="flash",
+
+ num_rollout=10,
+ rollout_batch_size=64,  # NOTE(review): exceeds the 50-row train_dataset — confirm prompts may repeat within a rollout
+ rollout_max_response_len=4096,
+ rollout_temperature=1.0,
+ sglang_mem_fraction_static=0.70,
+
+ n_samples_per_prompt=8,
+ global_batch_size=512,  # = rollout_batch_size 64 × n_samples_per_prompt 8
+ max_tokens_per_gpu=16384,
+
+ optimizer_cpu_offload=True,
+ overlap_cpu_optimizer_d2h_h2d=True,
+ use_precision_aware_optimizer=True,
+
+ save_interval=5,  # with num_rollout=10 — presumably checkpoints after rollouts 5 and 10; confirm
+ apply_chat_template_kwargs='{"enable_thinking": false}',  # JSON text (not a dict); same setting as the eval's generate_kwargs
+ ),
+ )
+
+ # ## Launch training
+ #
+ # `TrainConfig.train()` builds the Modal app, launches the 64-GPU cluster
+ # (8 nodes × 8 H100), runs GRPO training, and returns a `TrainResult`.
+
+ print("--- Running training... ---")
+ train_result = training_run.train()
+ print("--- Training complete ---")
+
+ # ## Serve and evaluate the trained checkpoint
+ #
+ # Let's deploy the trained checkpoint and run the same GSM8K eval.
+
+ checkpoint = list_checkpoints(train_result.training_run_id)[-1]  # NOTE(review): assumes a non-empty list ending with the newest checkpoint — confirm
+ print(f"Checkpoint: {checkpoint.path}")
+
+ trained_model_deployment = DeploymentConfig(
+ model=GLM_4_7(),  # new GLM_4_7() instance rather than reusing base_model
+ checkpoint=checkpoint,
+ app_name="glm-4.7-gsm8k-serve",  # explicit names; the base deployment above passes none
+ served_model_name="glm-4.7-gsm8k",
+ ).serve()
+ print(f"Trained model deployed to {trained_model_deployment.url}")
+
+ print("--- Running trained model evaluation... ---")
+ trained_eval = eval_config.evaluate(trained_model_deployment, debug=True)  # same eval_config as the base run
+ print(f"Trained model GSM8K accuracy: {trained_eval.mean:.1%}")
+ print("--- Trained model evaluation complete ---")
+
+ # ## Compare results
+
+ print(f"Base model GSM8K accuracy: {base_eval.mean:.1%}")
+ print(f"Trained model GSM8K accuracy: {trained_eval.mean:.1%}")
+ improvement = trained_eval.mean - base_eval.mean  # absolute accuracy difference, not a relative gain
+ print(f"Improvement: {improvement:+.1%}")
+
+@tutorial_cli_app.local_entrypoint()  # registers main on the module-level modal.App
+def main() -> None:  # also invoked directly by the __main__ guard at the end of the file
+ _main_impl()
+
+if __name__ == "__main__":
+ main()
diff --git a/tutorials/tutorial_generator/rl/003_glm_gsm8k.py b/tutorials/tutorial_generator/rl/003_glm_gsm8k.py
new file mode 100644
index 00000000..c69081cf
--- /dev/null
+++ b/tutorials/tutorial_generator/rl/003_glm_gsm8k.py
@@ -0,0 +1,340 @@
+"""Tutorial source for `003_glm_gsm8k` — parsed by generate_tutorial.py."""
+
+TUTORIAL_METADATA = {
+ "framework": "`slime`",
+ "cluster_shape": "8 × 8×H100",  # 8 nodes × 8 H100 GPUs each
+ "summary": "GLM-4.7 (355B MoE) on GSM8K math — serve, evaluate, GRPO-train, compare",  # same text as this tutorial's README table row
+ "difficulty": "Advanced",
+ "order": 30,
+ "api_classes": [  # NOTE(review): HuggingFaceDataset and list_checkpoints are imported by the tutorial but not listed — confirm
+ "GLM_4_7",
+ "DeploymentConfig",
+ "EvalConfig",
+ "EvalRowResult",
+ "TrainConfig",
+ "SlimeRecipe",
+ "TrainResult",  # not imported by the tutorial; it is what TrainConfig.train() returns
+ ],
+}
+
+
+from tutorial_generator import code, markdown, notebook_only, py_only, shell
+
+
+@markdown
+def _intro():
+ """
+ # Training GLM-4.7 on GSM8K
+
+ This tutorial trains [GLM-4.7](https://huggingface.co/zai-org/GLM-4.7),
+ a 355B-parameter Mixture-of-Experts model (32B active per token),
+ on grade-school math problems from [GSM8K](https://huggingface.co/datasets/openai/gsm8k).
+
+ GLM-4.7 is a large MoE model with 160 routed experts and top-8 routing.
+ The first 3 layers are dense; the remaining 89 use sparse expert routing.
+ Training a model this size requires multi-node parallelism. The tutorial covers:
+
+ 1. Serving the base model with SGLang on 8×H100.
+ 2. Loading the GSM8K dataset and defining a math-answer scorer.
+ 3. Evaluating the base model.
+ 4. GRPO-training with [slime](https://github.com/THUDM/slime) using the built-in math reward.
+ 5. Evaluating the trained checkpoint and comparing.
+
+ **Cluster shape:** 8 nodes × 8 H100 GPUs (64 GPUs total). Training uses
+ TP=8, PP=4, CP=2, EP=16 to shard the model across all GPUs.
+ """
+
+
+@py_only
+@markdown
+def run_instructions():
+ """
+ To run the tutorial, run the following command:
+ ```
+ uv run python tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.py
+ ```
+ """
+
+
+@notebook_only
+@shell("%uv pip install -q git+https://github.com/modal-projects/training-gym.git@main")
+def _install():
+ pass
+
+
+@code
+def _imports():
+ import re
+
+ from modal_training_gym import (
+ DeploymentConfig,
+ EvalConfig,
+ EvalRowResult,
+ GLM_4_7,
+ HuggingFaceDataset,
+ SlimeRecipe,
+ TrainConfig,
+ list_checkpoints,
+ )
+
+
+@markdown
+def _serve_base_intro():
+ """
+ ## Serve the base model
+
+ First, let's deploy GLM-4.7 so we can test it.
+ The 355B model needs 8×H100 GPUs for serving with tensor parallelism.
+ """
+
+
+@code
+def _serve_base_model():
+ base_model = GLM_4_7()
+ base_model_deployment = DeploymentConfig(
+ model=base_model,
+ ).serve()
+ print(f"Base model deployed to {base_model_deployment.url}")
+
+
+@notebook_only
+@markdown
+def _try_base_model():
+ """
+ Let's try asking it a math question to see how it responds.
+ """
+
+
+@notebook_only
+@code
+def _try_base_model_code():
+ response = base_model_deployment.generate(
+ "What is 24 * 37?",
+ chat_template_kwargs={"enable_thinking": False},
+ )
+ print(response)
+
+
+@markdown
+def _dataset_intro():
+ """
+ ## Define the GSM8K dataset
+
+ GSM8K contains grade-school math word problems. Each row has a `question`
+ and an `answer` that ends with `#### <number>` — the final numerical answer.
+
+ We'll use the answer extraction pattern to score the model: extract the
+ number after `####` from the label and compare it to the model's final number.
+ """
+
+
+@code
+def _define_dataset():
+ class GSM8KDataset(HuggingFaceDataset):  # GSM8K exposed as question → answer rows
+ hf_repo = "openai/gsm8k"
+ hf_config = "main"
+ input_column = "question"  # the word problem
+ output_column = "answer"  # gsm8k_score reads example["answer"] — presumably this column; confirm
+ output_format = "jsonl"
+ apply_chat_template = True
+ system_prompt = (  # requests the "#### <n>" form that extract_answer searches for first
+ "You are a math problem solver. Solve the given math problem "
+ "step by step, then give your final answer as a number on the "
+ "last line prefixed with ####. For example: #### 42"
+ )
+ always_prepare = True
+
+ train_dataset = GSM8KDataset(n_rows=50)  # no hf_split given — NOTE(review): presumably defaults to the train split; confirm
+ eval_dataset = GSM8KDataset(n_rows=20, hf_split="test")  # evaluation reads the "test" split
+
+
+@notebook_only
+@markdown
+def _peek_dataset():
+ """
+ Let's look at a few examples from the dataset.
+ """
+
+
+@notebook_only
+@code
+def _peek_dataset_code():
+ df = eval_dataset.to_pandas()
+ print(f"Eval set: {len(df)} rows")
+ df.head(3)
+
+
+@markdown
+def _scoring_intro():
+ """
+ ## Define the scoring function
+
+ GSM8K answers end with `#### <number>`. We extract the number from both
+ the label and the model's response, then check if they match.
+ """
+
+
+@code
+def _define_scoring():
+ def extract_answer(text: str) -> str | None:
+ match = re.search(r"####\s*(-?[\d,]+(?:\.\d+)?)", text)
+ if match:
+ return match.group(1).replace(",", "").strip()
+ numbers = re.findall(r"-?[\d,]+(?:\.\d+)?", text)
+ if numbers:
+ return numbers[-1].replace(",", "").strip()
+ return None
+
+ def gsm8k_score(example: dict, response: str) -> EvalRowResult:
+ expected = extract_answer(example.get("answer", ""))
+ predicted = extract_answer(response)
+ correct = expected is not None and predicted is not None and expected == predicted
+ return EvalRowResult(
+ score=1.0 if correct else 0.0,
+ response=response,
+ )
+
+
+@notebook_only
+@markdown
+def _eval_base_intro():
+ """
+ ## Evaluate the base model
+ """
+
+
+@code
+def _eval_base():
+ eval_config = EvalConfig(  # reused unchanged by _eval_trained
+ dataset=eval_dataset,
+ eval_response_fn=gsm8k_score,
+ generate_kwargs={"chat_template_kwargs": {"enable_thinking": False}},  # same setting the recipe passes via apply_chat_template_kwargs
+ )
+ print("--- Running base model evaluation... ---")
+ base_eval = eval_config.evaluate(base_model_deployment, debug=True)  # kept for the comparison in _compare
+ print(f"Base model GSM8K accuracy: {base_eval.mean:.1%}")
+ print("--- Base model evaluation complete ---")
+
+
+@markdown
+def _train_intro():
+ """
+ ## Train with slime
+
+ Now let's GRPO-train GLM-4.7 on GSM8K using the built-in `math`
+ reward type. This reward extracts the final number from the model's
+ response and checks it against the label — the same logic as our eval
+ scoring function.
+
+ Because GLM-4.7 is a 355B MoE model, the recipe uses heavy parallelism:
+ - **TP=8:** tensor parallelism across all GPUs in a node.
+ - **PP=4:** pipeline parallelism across 4 stages.
+ - **EP=16:** expert parallelism shards the 160 experts across 16 groups.
+ - **CP=2:** context parallelism for long sequences.
+ - **CPU optimizer offload:** frees GPU memory for the large parameter count.
+ """
+
+
+@code
+def _define_training():
+ training_run = TrainConfig(
+ model=base_model,
+ dataset=train_dataset,
+ recipe=SlimeRecipe(
+ rm_type="math",
+
+ gpu_type="H100",
+ colocate=True,
+ actor_num_nodes=8,  # 8 nodes × 8 GPUs per node = 64 GPUs
+ actor_num_gpus_per_node=8,
+ tensor_model_parallel_size=8,  # TP 8 × PP 4 × CP 2 = 64 = total GPU count
+ sequence_parallel=True,
+ rollout_num_gpus_per_engine=32,
+
+ expert_model_parallel_size=16,
+ expert_tensor_parallel_size=1,
+ pipeline_model_parallel_size=4,
+ context_parallel_size=2,
+ attention_backend="flash",
+
+ num_rollout=10,
+ rollout_batch_size=64,  # NOTE(review): exceeds the 50-row train_dataset — confirm prompts may repeat within a rollout
+ rollout_max_response_len=4096,
+ rollout_temperature=1.0,
+ sglang_mem_fraction_static=0.70,
+
+ n_samples_per_prompt=8,
+ global_batch_size=512,  # = rollout_batch_size 64 × n_samples_per_prompt 8
+ max_tokens_per_gpu=16384,
+
+ optimizer_cpu_offload=True,
+ overlap_cpu_optimizer_d2h_h2d=True,
+ use_precision_aware_optimizer=True,
+
+ save_interval=5,  # with num_rollout=10 — presumably checkpoints after rollouts 5 and 10; confirm
+ apply_chat_template_kwargs='{"enable_thinking": false}',  # JSON text (not a dict); same setting as the eval's generate_kwargs
+ ),
+ )
+
+
+@markdown
+def _launch_training():
+ """
+ ## Launch training
+
+ `TrainConfig.train()` builds the Modal app, launches the 64-GPU cluster
+ (8 nodes × 8 H100), runs GRPO training, and returns a `TrainResult`.
+ """
+
+
+@code
+def _run_training():
+ print("--- Running training... ---")
+ train_result = training_run.train()
+ print("--- Training complete ---")
+
+
+@markdown
+def _eval_trained_intro():
+ """
+ ## Serve and evaluate the trained checkpoint
+
+ Let's deploy the trained checkpoint and run the same GSM8K eval.
+ """
+
+
+@code
+def _serve_trained():
+ checkpoint = list_checkpoints(train_result.training_run_id)[-1]  # NOTE(review): assumes a non-empty list ending with the newest checkpoint — confirm
+ print(f"Checkpoint: {checkpoint.path}")
+
+ trained_model_deployment = DeploymentConfig(
+ model=GLM_4_7(),  # new GLM_4_7() instance rather than reusing base_model
+ checkpoint=checkpoint,
+ app_name="glm-4.7-gsm8k-serve",  # explicit names; the base deployment passes none
+ served_model_name="glm-4.7-gsm8k",
+ ).serve()
+ print(f"Trained model deployed to {trained_model_deployment.url}")
+
+
+@code
+def _eval_trained():
+ print("--- Running trained model evaluation... ---")
+ trained_eval = eval_config.evaluate(trained_model_deployment, debug=True)  # same eval_config (dataset, scorer, generate kwargs) as the base run
+ print(f"Trained model GSM8K accuracy: {trained_eval.mean:.1%}")
+ print("--- Trained model evaluation complete ---")
+
+
+@markdown
+def _compare_intro():
+ """
+ ## Compare results
+ """
+
+
+@code
+def _compare():
+ print(f"Base model GSM8K accuracy: {base_eval.mean:.1%}")
+ print(f"Trained model GSM8K accuracy: {trained_eval.mean:.1%}")
+ improvement = trained_eval.mean - base_eval.mean  # absolute accuracy difference, not a relative gain
+ print(f"Improvement: {improvement:+.1%}")  # "+" flag prints the sign for gains as well as regressions