diff --git a/README.md b/README.md
index 94704cc2..79c5214a 100644
--- a/README.md
+++ b/README.md
@@ -70,9 +70,10 @@ rest of the cells run as-is.
 
 | Tutorial | Summary | Difficulty | Framework | Launch |
 |---|---|---|---|---|
-| [`000_rl_basics`](tutorials/rl/000_rl_basics/000_rl_basics.ipynb) | Qwen3-4B haiku evaluation with verifiable rewards — serve, evaluate, train, compare | Beginner | `slime` | <a href="https://modal.com/notebooks/new/https://github.com/modal-projects/training-gym/blob/devin/1778791977-positive-rewards-001-sandboxes/tutorials/rl/000_rl_basics/000_rl_basics.ipynb" target="_blank" rel="nofollow noopener noreferrer"><img src="https://modal-cdn.com/open-in-modal.svg" alt="Open in Modal"></a> |
-| [`001_sandboxes`](tutorials/rl/001_sandboxes/001_sandboxes.ipynb) | Code RL with Harbor hello-world and sandboxed verification | Intermediate | `slime` | <a href="https://modal.com/notebooks/new/https://github.com/modal-projects/training-gym/blob/devin/1778791977-positive-rewards-001-sandboxes/tutorials/rl/001_sandboxes/001_sandboxes.ipynb" target="_blank" rel="nofollow noopener noreferrer"><img src="https://modal-cdn.com/open-in-modal.svg" alt="Open in Modal"></a> |
-| [`002_multiturn`](tutorials/rl/002_multiturn/002_multiturn.ipynb) | Multi-turn number-guessing RL with custom generate and reward functions | Intermediate | `slime` | <a href="https://modal.com/notebooks/new/https://github.com/modal-projects/training-gym/blob/devin/1778791977-positive-rewards-001-sandboxes/tutorials/rl/002_multiturn/002_multiturn.ipynb" target="_blank" rel="nofollow noopener noreferrer"><img src="https://modal-cdn.com/open-in-modal.svg" alt="Open in Modal"></a> |
+| [`000_rl_basics`](tutorials/rl/000_rl_basics/000_rl_basics.ipynb) | Qwen3-4B haiku evaluation with verifiable rewards — serve, evaluate, train, compare | Beginner | `slime` | <a href="https://modal.com/notebooks/new/https://github.com/modal-projects/training-gym/blob/main/tutorials/rl/000_rl_basics/000_rl_basics.ipynb" target="_blank" rel="nofollow noopener noreferrer"><img src="https://modal-cdn.com/open-in-modal.svg" alt="Open in Modal"></a> |
+| [`001_sandboxes`](tutorials/rl/001_sandboxes/001_sandboxes.ipynb) | Code RL with Harbor hello-world and sandboxed verification | Intermediate | `slime` | <a href="https://modal.com/notebooks/new/https://github.com/modal-projects/training-gym/blob/main/tutorials/rl/001_sandboxes/001_sandboxes.ipynb" target="_blank" rel="nofollow noopener noreferrer"><img src="https://modal-cdn.com/open-in-modal.svg" alt="Open in Modal"></a> |
+| [`002_multiturn`](tutorials/rl/002_multiturn/002_multiturn.ipynb) | Multi-turn number-guessing RL with custom generate and reward functions | Intermediate | `slime` | <a href="https://modal.com/notebooks/new/https://github.com/modal-projects/training-gym/blob/main/tutorials/rl/002_multiturn/002_multiturn.ipynb" target="_blank" rel="nofollow noopener noreferrer"><img src="https://modal-cdn.com/open-in-modal.svg" alt="Open in Modal"></a> |
+| [`003_glm_gsm8k`](tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.ipynb) | GLM-4.7 (355B MoE) on GSM8K math — serve, evaluate, GRPO-train, compare | Advanced | `slime` | <a href="https://modal.com/notebooks/new/https://github.com/modal-projects/training-gym/blob/main/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.ipynb" target="_blank" rel="nofollow noopener noreferrer"><img src="https://modal-cdn.com/open-in-modal.svg" alt="Open in Modal"></a> |
 <!-- END TUTORIAL TABLE -->
 
 See [`tutorials/README.md`](tutorials/README.md) for how to run the `.py`
diff --git a/modal_training_gym/__init__.py b/modal_training_gym/__init__.py
index a5e5ab6f..273084f8 100644
--- a/modal_training_gym/__init__.py
+++ b/modal_training_gym/__init__.py
@@ -10,6 +10,8 @@
     "EvalConfigDurable": ("modal_training_gym.common.eval", "EvalConfigDurable"),
     "EvalResult": ("modal_training_gym.common.eval", "EvalResult"),
     "EvalRowResult": ("modal_training_gym.common.eval", "EvalRowResult"),
+    "GLM_4_7": ("modal_training_gym.common.models", "GLM_4_7"),
+    "GLM_4_7_Flash": ("modal_training_gym.common.models", "GLM_4_7_Flash"),
     "HFModelConfiguration": (
         "modal_training_gym.common.models",
         "HFModelConfiguration",
@@ -60,6 +62,8 @@
     "ModelConfig",
     "ModelDeployment",
     "MultiTurn",
+    "GLM_4_7",
+    "GLM_4_7_Flash",
     "Qwen3_0_6B",
     "Qwen3_1_7B",
     "Qwen3_4B",
diff --git a/modal_training_gym/common/models/__init__.py b/modal_training_gym/common/models/__init__.py
index f2401e3a..8316d4ef 100644
--- a/modal_training_gym/common/models/__init__.py
+++ b/modal_training_gym/common/models/__init__.py
@@ -3,6 +3,8 @@
     ModelArchitecture,
     ModelConfig,
 )
+from .glm_4_7 import GLM_4_7
+from .glm_4_7_flash import GLM_4_7_Flash
 from .qwen3_0_6b import Qwen3_0_6B
 from .qwen3_1_7b import Qwen3_1_7B
 from .qwen3_4b import Qwen3_4B
@@ -12,6 +14,8 @@
 from .qwen3_32b import Qwen3_32B
 
 __all__ = [
+    "GLM_4_7",
+    "GLM_4_7_Flash",
     "HFModelConfiguration",
     "ModelArchitecture",
     "ModelConfig",
diff --git a/modal_training_gym/common/models/base.py b/modal_training_gym/common/models/base.py
index f8123bcc..5cafad13 100644
--- a/modal_training_gym/common/models/base.py
+++ b/modal_training_gym/common/models/base.py
@@ -75,6 +75,13 @@ class ModelArchitecture:
     use_rotary_position_embeddings: bool = True
     rotary_base: int = 10000
 
+    # MoE (Mixture of Experts)
+    num_experts: int = 0
+    moe_router_topk: int = 0
+    moe_ffn_hidden_size: int = 0
+    num_shared_experts: int = 0
+    first_k_dense_replace: int = 0
+
     def to_megatron_args(self) -> list[str]:
         """Generate Megatron-LM CLI flags from this architecture spec."""
         args: list[str] = []
@@ -111,6 +118,16 @@ def to_megatron_args(self) -> list[str]:
             args += ["--position-embedding-type", "rope"]
             if self.rotary_base != 10000:
                 args += ["--rotary-base", str(self.rotary_base)]
+        if self.num_experts:
+            args += ["--num-experts", str(self.num_experts)]
+        if self.moe_router_topk:
+            args += ["--moe-router-topk", str(self.moe_router_topk)]
+        if self.moe_ffn_hidden_size:
+            args += ["--moe-ffn-hidden-size", str(self.moe_ffn_hidden_size)]
+        if self.num_shared_experts:
+            args += ["--num-shared-experts", str(self.num_shared_experts)]
+        if self.first_k_dense_replace:
+            args += ["--first-k-dense-replace", str(self.first_k_dense_replace)]
         return args
 
 
diff --git a/modal_training_gym/common/models/glm_4_7.py b/modal_training_gym/common/models/glm_4_7.py
new file mode 100644
index 00000000..8e375a5e
--- /dev/null
+++ b/modal_training_gym/common/models/glm_4_7.py
@@ -0,0 +1,38 @@
+"""GLM-4.7 (355B-A32B MoE) model spec."""
+
+from .base import HFModelConfiguration, ModelArchitecture
+
+
+class GLM_4_7(HFModelConfiguration):
+    """GLM-4.7 (355B total, 32B active) MoE from Zhipu AI.
+
+    160 routed experts with top-8 routing plus 1 shared expert.
+    First 3 layers are dense; remaining 89 are MoE.
+    Uses GQA (96 Q heads, 8 KV heads) with partial RoPE.
+    Downloads from ``zai-org/GLM-4.7`` on HuggingFace.
+    """
+
+    model_name = "zai-org/GLM-4.7"
+    architecture = ModelArchitecture(
+        num_layers=92,  # 3 dense + 89 MoE layers (see first_k_dense_replace)
+        hidden_size=5120,
+        ffn_hidden_size=12288,
+        num_attention_heads=96,  # query heads
+        group_query_attention=True,
+        num_query_groups=8,  # KV heads shared by the 96 query heads
+        kv_channels=128,
+        vocab_size=151552,
+        normalization="RMSNorm",
+        norm_epsilon=1e-5,
+        swiglu=True,
+        disable_bias_linear=False,
+        qk_layernorm=True,
+        untie_embeddings_and_output_weights=True,
+        use_rotary_position_embeddings=True,  # NOTE(review): docstring says partial RoPE, but no rotary fraction is set here — confirm
+        rotary_base=1000000,
+        num_experts=160,  # routed experts
+        moe_router_topk=8,  # top-8 routing
+        moe_ffn_hidden_size=1536,
+        num_shared_experts=1,
+        first_k_dense_replace=3,  # number of leading dense (non-MoE) layers
+    )
diff --git a/modal_training_gym/common/models/glm_4_7_flash.py b/modal_training_gym/common/models/glm_4_7_flash.py
new file mode 100644
index 00000000..6bb84538
--- /dev/null
+++ b/modal_training_gym/common/models/glm_4_7_flash.py
@@ -0,0 +1,36 @@
+"""GLM-4.7-Flash (30B-A3B MoE) model spec."""
+
+from .base import HFModelConfiguration, ModelArchitecture
+
+
+class GLM_4_7_Flash(HFModelConfiguration):
+    """GLM-4.7-Flash (30B total, 3B active) MoE from Zhipu AI.
+
+    64 routed experts with top-4 routing plus 1 shared expert.
+    Uses Multi-head Latent Attention (MLA) and multi-token prediction.
+    Downloads from ``zai-org/GLM-4.7-Flash`` on HuggingFace.
+    """
+
+    model_name = "zai-org/GLM-4.7-Flash"
+    architecture = ModelArchitecture(
+        num_layers=47,
+        hidden_size=2048,
+        ffn_hidden_size=10240,
+        num_attention_heads=20,
+        group_query_attention=False,  # NOTE(review): docstring mentions MLA, but no MLA-specific fields are set in this spec — confirm
+        num_query_groups=20,  # same as num_attention_heads (no KV-head grouping)
+        kv_channels=128,
+        vocab_size=154880,
+        normalization="RMSNorm",
+        norm_epsilon=1e-5,
+        swiglu=True,
+        disable_bias_linear=True,
+        qk_layernorm=True,
+        use_rotary_position_embeddings=True,
+        rotary_base=1000000,
+        num_experts=64,  # routed experts
+        moe_router_topk=4,  # top-4 routing
+        moe_ffn_hidden_size=1536,
+        num_shared_experts=1,
+        first_k_dense_replace=1,
+    )
diff --git a/modal_training_gym/deploy_recipes/sglang_recipe/__init__.py b/modal_training_gym/deploy_recipes/sglang_recipe/__init__.py
index 173da428..faae4f75 100644
--- a/modal_training_gym/deploy_recipes/sglang_recipe/__init__.py
+++ b/modal_training_gym/deploy_recipes/sglang_recipe/__init__.py
@@ -1,4 +1,10 @@
 from modal_training_gym.deploy_recipes.sglang_recipe.recipe import SglangRecipe
+from modal_training_gym.deploy_recipes.sglang_recipe.glm_4_7 import (
+    GLM_4_7_SglangRecipe,
+)
+from modal_training_gym.deploy_recipes.sglang_recipe.glm_4_7_flash import (
+    GLM_4_7_Flash_SglangRecipe,
+)
 from modal_training_gym.deploy_recipes.sglang_recipe.qwen3_0_6b import (
     Qwen3_0_6b_SglangRecipe,
 )
@@ -22,6 +28,8 @@
 )
 
 __all__ = [
+    "GLM_4_7_SglangRecipe",
+    "GLM_4_7_Flash_SglangRecipe",
     "SglangRecipe",
     "Qwen3_0_6b_SglangRecipe",
     "Qwen3_1_7b_SglangRecipe",
diff --git a/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7.py b/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7.py
new file mode 100644
index 00000000..84383006
--- /dev/null
+++ b/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7.py
@@ -0,0 +1,38 @@
+import copy
+from dataclasses import dataclass
+
+from modal_training_gym.deploy_recipes.sglang_recipe.recipe import SglangRecipe
+
+# GLM-4.7 overrides applied on top of the generic SglangRecipe defaults.
+# NOTE(review): if weights are served in BF16 (2 bytes/param), 355B parameters
+# is ~710 GB, more than 8 x 80 GB of H100 memory — confirm this fits.
+_GLM_4_7_DEFAULTS = {
+    "gpu": "H100",
+    "tp": 8,
+    "context_length": 32768,
+    "mem_fraction_static": 0.80,
+    "chunked_prefill_size": 8192,
+    "max_running_requests": 16,
+    "extra_server_args": {"--trust-remote-code": ""},
+}
+
+
+# Baseline instance used to detect fields the caller left at the base default.
+_SGLANG_DEFAULTS = SglangRecipe()
+
+
+@dataclass
+class GLM_4_7_SglangRecipe(SglangRecipe):
+    """GLM-4.7 (355B) on 8×H100 — tensor-parallel MoE serving."""
+
+    def __post_init__(self) -> None:
+        """Swap in the GLM-4.7 default for every field still at the base default.
+
+        A field explicitly set to the base default value is indistinguishable
+        from an unset one and is overridden as well.
+        """
+        for key, val in _GLM_4_7_DEFAULTS.items():
+            if getattr(self, key) == getattr(_SGLANG_DEFAULTS, key):
+                # Deep-copy so instances never alias the module-level mutable
+                # defaults (e.g. the ``extra_server_args`` dict).
+                object.__setattr__(self, key, copy.deepcopy(val))
diff --git a/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7_flash.py b/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7_flash.py
new file mode 100644
index 00000000..d248b63a
--- /dev/null
+++ b/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7_flash.py
@@ -0,0 +1,37 @@
+import copy
+from dataclasses import dataclass
+
+from modal_training_gym.deploy_recipes.sglang_recipe.recipe import SglangRecipe
+
+# GLM-4.7-Flash overrides applied on top of the generic SglangRecipe defaults.
+_GLM_4_7_FLASH_DEFAULTS = {
+    "gpu": "H100",
+    "tp": 1,
+    "dp": 8,
+    "context_length": 32768,
+    "mem_fraction_static": 0.80,
+    "chunked_prefill_size": 8192,
+    "max_running_requests": 16,
+    "extra_server_args": {"--trust-remote-code": ""},
+}
+
+
+# Baseline instance used to detect fields the caller left at the base default.
+_SGLANG_DEFAULTS = SglangRecipe()
+
+
+@dataclass
+class GLM_4_7_Flash_SglangRecipe(SglangRecipe):
+    """GLM-4.7-Flash on 8×H100 — DP-attention MoE serving."""
+
+    def __post_init__(self) -> None:
+        """Swap in the Flash default for every field still at the base default.
+
+        A field explicitly set to the base default value is indistinguishable
+        from an unset one and is overridden as well.
+        """
+        for key, val in _GLM_4_7_FLASH_DEFAULTS.items():
+            if getattr(self, key) == getattr(_SGLANG_DEFAULTS, key):
+                # Deep-copy so instances never alias the module-level mutable
+                # defaults (e.g. the ``extra_server_args`` dict).
+                object.__setattr__(self, key, copy.deepcopy(val))
diff --git a/modal_training_gym/deploy_recipes/vllm_recipe/__init__.py b/modal_training_gym/deploy_recipes/vllm_recipe/__init__.py
index cdcb8a99..34091d67 100644
--- a/modal_training_gym/deploy_recipes/vllm_recipe/__init__.py
+++ b/modal_training_gym/deploy_recipes/vllm_recipe/__init__.py
@@ -1,4 +1,10 @@
 from modal_training_gym.deploy_recipes.vllm_recipe.recipe import VllmRecipe
+from modal_training_gym.deploy_recipes.vllm_recipe.glm_4_7 import (
+    GLM_4_7_VllmRecipe,
+)
+from modal_training_gym.deploy_recipes.vllm_recipe.glm_4_7_flash import (
+    GLM_4_7_Flash_VllmRecipe,
+)
 from modal_training_gym.deploy_recipes.vllm_recipe.qwen3_0_6b import (
     Qwen3_0_6b_VllmRecipe,
 )
@@ -12,6 +18,8 @@
 from modal_training_gym.deploy_recipes.vllm_recipe.qwen3_32b import Qwen3_32b_VllmRecipe
 
 __all__ = [
+    "GLM_4_7_VllmRecipe",
+    "GLM_4_7_Flash_VllmRecipe",
     "VllmRecipe",
     "Qwen3_0_6b_VllmRecipe",
     "Qwen3_1_7b_VllmRecipe",
diff --git a/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7.py b/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7.py
new file mode 100644
index 00000000..09010cf5
--- /dev/null
+++ b/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7.py
@@ -0,0 +1,33 @@
+import copy
+from dataclasses import dataclass
+
+from modal_training_gym.deploy_recipes.vllm_recipe.recipe import VllmRecipe
+
+# GLM-4.7 overrides applied on top of the generic VllmRecipe defaults.
+# NOTE(review): if weights are served in BF16 (2 bytes/param), 355B parameters
+# is ~710 GB, more than 8 x 80 GB of H100 memory — confirm this fits.
+_GLM_4_7_DEFAULTS = {
+    "gpu": "H100",
+    "n_gpu": 8,
+    "extra_vllm_args": ["--trust-remote-code"],
+}
+
+# Baseline instance used to detect fields the caller left at the base default.
+_VLLM_DEFAULTS = VllmRecipe()
+
+
+@dataclass
+class GLM_4_7_VllmRecipe(VllmRecipe):
+    """GLM-4.7 (355B) on 8×H100 — tensor-parallel MoE serving."""
+
+    def __post_init__(self) -> None:
+        """Swap in the GLM-4.7 default for every field still at the base default.
+
+        A field explicitly set to the base default value is indistinguishable
+        from an unset one and is overridden as well.
+        """
+        for key, val in _GLM_4_7_DEFAULTS.items():
+            if getattr(self, key) == getattr(_VLLM_DEFAULTS, key):
+                # Deep-copy so instances never alias the module-level mutable
+                # defaults (e.g. the ``extra_vllm_args`` list).
+                object.__setattr__(self, key, copy.deepcopy(val))
diff --git a/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7_flash.py b/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7_flash.py
new file mode 100644
index 00000000..50cc52c4
--- /dev/null
+++ b/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7_flash.py
@@ -0,0 +1,31 @@
+import copy
+from dataclasses import dataclass
+
+from modal_training_gym.deploy_recipes.vllm_recipe.recipe import VllmRecipe
+
+# GLM-4.7-Flash overrides applied on top of the generic VllmRecipe defaults.
+_GLM_4_7_FLASH_DEFAULTS = {
+    "gpu": "H100",
+    "n_gpu": 2,
+    "extra_vllm_args": ["--trust-remote-code"],
+}
+
+# Baseline instance used to detect fields the caller left at the base default.
+_VLLM_DEFAULTS = VllmRecipe()
+
+
+@dataclass
+class GLM_4_7_Flash_VllmRecipe(VllmRecipe):
+    """GLM-4.7-Flash on 2×H100 — tensor-parallel MoE serving."""
+
+    def __post_init__(self) -> None:
+        """Swap in the Flash default for every field still at the base default.
+
+        A field explicitly set to the base default value is indistinguishable
+        from an unset one and is overridden as well.
+        """
+        for key, val in _GLM_4_7_FLASH_DEFAULTS.items():
+            if getattr(self, key) == getattr(_VLLM_DEFAULTS, key):
+                # Deep-copy so instances never alias the module-level mutable
+                # defaults (e.g. the ``extra_vllm_args`` list).
+                object.__setattr__(self, key, copy.deepcopy(val))
diff --git a/modal_training_gym/train_recipes/slime_recipe/__init__.py b/modal_training_gym/train_recipes/slime_recipe/__init__.py
index 2c8e5635..7487668b 100644
--- a/modal_training_gym/train_recipes/slime_recipe/__init__.py
+++ b/modal_training_gym/train_recipes/slime_recipe/__init__.py
@@ -3,6 +3,10 @@
     SlimeRecipeBlock,
 )
 from modal_training_gym.train_recipes.slime_recipe.recipe import SlimeRecipe
+from modal_training_gym.train_recipes.slime_recipe.glm_4_7 import GLM_4_7_Recipe
+from modal_training_gym.train_recipes.slime_recipe.glm_4_7_flash import (
+    GLM_4_7_Flash_Recipe,
+)
 from modal_training_gym.train_recipes.slime_recipe.qwen3_1_7b import Qwen3_1_7b_Recipe
 from modal_training_gym.train_recipes.slime_recipe.qwen3_8b import Qwen3_8b_Recipe
 from modal_training_gym.train_recipes.slime_recipe.qwen3_14b import Qwen3_14b_Recipe
@@ -10,6 +14,8 @@
 from modal_training_gym.train_recipes.slime_recipe.qwen3_4b import Qwen3_4b_Recipe
 
 __all__ = [
+    "GLM_4_7_Recipe",
+    "GLM_4_7_Flash_Recipe",
     "MultiTurn",
     "SlimeRecipe",
     "SlimeRecipeBlock",
diff --git a/modal_training_gym/train_recipes/slime_recipe/glm_4_7.py b/modal_training_gym/train_recipes/slime_recipe/glm_4_7.py
new file mode 100644
index 00000000..2de0060a
--- /dev/null
+++ b/modal_training_gym/train_recipes/slime_recipe/glm_4_7.py
@@ -0,0 +1,51 @@
+from pydantic import ConfigDict
+from pydantic.dataclasses import dataclass
+
+from modal_training_gym.train_recipes.slime_recipe.recipe import SlimeRecipe
+
+
+@dataclass(config=ConfigDict(extra="forbid", arbitrary_types_allowed=True))
+class GLM_4_7_Recipe(SlimeRecipe):
+    """GLM-4.7 (355B-A32B MoE) on 8×8×H100, colocated GRPO.
+
+    TP=8, PP=4, CP=2, EP=16 across 8 nodes (64 GPUs).
+    Uses CPU optimizer offloading for the large parameter count.
+    """
+
+    gpu_type: str = "H100"
+    colocate: bool = True
+    actor_num_nodes: int = 8
+    actor_num_gpus_per_node: int = 8  # 8 nodes × 8 GPUs = 64 GPUs
+    tensor_model_parallel_size: int = 8
+    sequence_parallel: bool = True
+    rollout_num_gpus_per_engine: int = 32
+
+    # Expert, pipeline and context parallelism (plus attention backend)
+    expert_model_parallel_size: int = 16
+    expert_tensor_parallel_size: int = 1
+    pipeline_model_parallel_size: int = 4
+    context_parallel_size: int = 2  # TP (8) × PP (4) × CP (2) = 64
+    attention_backend: str | None = "flash"
+
+    # Rollout
+    num_rollout: int = 1
+    rollout_batch_size: int = 64
+    rollout_max_response_len: int = 4096
+    rollout_temperature: float = 1.0
+    sglang_mem_fraction_static: float = 0.70
+
+    save_interval: int = 10
+
+    # Training
+    n_samples_per_prompt: int = 8
+    global_batch_size: int = 512  # equals rollout_batch_size (64) × n_samples_per_prompt (8)
+    lr: float = 1e-6
+    max_tokens_per_gpu: int = 16384
+
+    # Optimizer offloading (required for 355B model)
+    optimizer_cpu_offload: bool = True
+    overlap_cpu_optimizer_d2h_h2d: bool = True
+    use_precision_aware_optimizer: bool = True
+
+    eval_interval: int | None = 10
+    eval_max_response_len: int = 4096
diff --git a/modal_training_gym/train_recipes/slime_recipe/glm_4_7_flash.py b/modal_training_gym/train_recipes/slime_recipe/glm_4_7_flash.py
new file mode 100644
index 00000000..90ee6745
--- /dev/null
+++ b/modal_training_gym/train_recipes/slime_recipe/glm_4_7_flash.py
@@ -0,0 +1,56 @@
+from pydantic import ConfigDict
+from pydantic.dataclasses import dataclass
+
+from modal_training_gym.train_recipes.slime_recipe.recipe import SlimeRecipe
+
+
+@dataclass(config=ConfigDict(extra="forbid", arbitrary_types_allowed=True))
+class GLM_4_7_Flash_Recipe(SlimeRecipe):
+    """GLM-4.7-Flash (30B-A3B MoE) on 1×8×H100, colocated GRPO.
+
+    TP=1, PP=1, EP=8 fits on a single 8-GPU node.
+    Uses MTP training, DeepEP, and CPU optimizer offloading.
+    """
+
+    gpu_type: str = "H100"  # NOTE(review): node/GPU counts are not set in this class; the "1×8" shape relies on SlimeRecipe defaults — confirm
+    colocate: bool = True
+    tensor_model_parallel_size: int = 1
+    sequence_parallel: bool = False
+    rollout_num_gpus_per_engine: int = 8
+
+    # Expert, pipeline and context parallelism
+    expert_model_parallel_size: int = 8
+    expert_tensor_parallel_size: int = 1
+    pipeline_model_parallel_size: int = 1
+    context_parallel_size: int = 1
+    moe_token_dispatcher_type: str | None = "flex"
+    moe_enable_deepep: bool = True
+
+    # MTP (multi-token prediction)
+    enable_mtp_training: bool = True
+    mtp_num_layers: int = 1
+    mtp_loss_scaling_factor: float = 0.2
+
+    # Rollout
+    num_rollout: int = 1
+    rollout_batch_size: int = 64
+    rollout_max_response_len: int = 4096
+    rollout_temperature: float = 1.0
+    sglang_mem_fraction_static: float = 0.70
+
+    save_interval: int = 10
+
+    # Training
+    n_samples_per_prompt: int = 8
+    global_batch_size: int = 512  # equals rollout_batch_size (64) × n_samples_per_prompt (8)
+    lr: float = 1e-6
+    max_tokens_per_gpu: int = 32768
+    attention_backend: str | None = "flash"
+
+    # Optimizer offloading (MoE models are memory-heavy)
+    optimizer_cpu_offload: bool = True
+    overlap_cpu_optimizer_d2h_h2d: bool = True
+    use_precision_aware_optimizer: bool = True
+
+    eval_interval: int | None = 10
+    eval_max_response_len: int = 4096
diff --git a/modal_training_gym/train_recipes/slime_recipe/recipe.py b/modal_training_gym/train_recipes/slime_recipe/recipe.py
index 46ad418f..a68bc8a7 100644
--- a/modal_training_gym/train_recipes/slime_recipe/recipe.py
+++ b/modal_training_gym/train_recipes/slime_recipe/recipe.py
@@ -115,6 +115,25 @@ class SlimeRecipe(BaseTrainRecipe):
     adam_beta2: float = 0.98
     optimizer: str = "adam"
 
+    # ── MoE parallelism ──────────────────────────────────────────────────────
+    expert_model_parallel_size: int = 1
+    expert_tensor_parallel_size: int = 1
+    pipeline_model_parallel_size: int = 1
+    context_parallel_size: int = 1
+    moe_token_dispatcher_type: str | None = None
+    moe_enable_deepep: bool = False
+
+    # ── Multi-token prediction (MTP) ─────────────────────────────────────
+    enable_mtp_training: bool = False
+    mtp_num_layers: int = 0
+    mtp_loss_scaling_factor: float = 0.0
+
+    # ── Optimizer offloading and attention backend ────────────────────────
+    optimizer_cpu_offload: bool = False
+    overlap_cpu_optimizer_d2h_h2d: bool = False
+    use_precision_aware_optimizer: bool = False
+    attention_backend: str | None = None
+
     # ── Memory and precision ────────────────────────────────────────────────
     attention_dropout: float = 0.0
     hidden_dropout: float = 0.0
@@ -252,7 +271,7 @@ def _validate_dataset(ds: "DatasetConfig") -> None:
     @staticmethod
     def _model_to_fields(m: "ModelConfig") -> dict[str, Any]:
         arch = SlimeRecipe._validate_custom_model_architecture(m)
-        return {
+        fields = {
             "hf_checkpoint": m.model_path or m.model_name,
             "num_layers": arch.num_layers,
             "hidden_size": arch.hidden_size,
@@ -271,6 +290,17 @@ def _model_to_fields(m: "ModelConfig") -> dict[str, Any]:
             "use_rotary_position_embeddings": arch.use_rotary_position_embeddings,
             "rotary_base": arch.rotary_base,
         }
+        if arch.num_experts:
+            fields["num_experts"] = arch.num_experts
+        if arch.moe_router_topk:
+            fields["moe_router_topk"] = arch.moe_router_topk
+        if arch.moe_ffn_hidden_size:
+            fields["moe_ffn_hidden_size"] = arch.moe_ffn_hidden_size
+        if arch.num_shared_experts:
+            fields["num_shared_experts"] = arch.num_shared_experts
+        if arch.first_k_dense_replace:
+            fields["first_k_dense_replace"] = arch.first_k_dense_replace
+        return fields
 
     @staticmethod
     def _wandb_to_fields(w: "WandbConfig") -> dict[str, Any]:
@@ -388,4 +418,16 @@ def get_base_recipe(cls, model_config: ModelConfig) -> "SlimeRecipe | None":
             return Qwen3_14b_Recipe()
         if model_config.model_name == "Qwen/Qwen3-32B":
             return Qwen3_32b_Recipe()
+        if model_config.model_name == "zai-org/GLM-4.7":
+            from modal_training_gym.train_recipes.slime_recipe.glm_4_7 import (
+                GLM_4_7_Recipe,
+            )
+
+            return GLM_4_7_Recipe()
+        if model_config.model_name == "zai-org/GLM-4.7-Flash":
+            from modal_training_gym.train_recipes.slime_recipe.glm_4_7_flash import (
+                GLM_4_7_Flash_Recipe,
+            )
+
+            return GLM_4_7_Flash_Recipe()
         return None
diff --git a/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.ipynb b/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.ipynb
new file mode 100644
index 00000000..e9c79f21
--- /dev/null
+++ b/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.ipynb
@@ -0,0 +1,434 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "id": "cell-000",
+   "metadata": {},
+   "source": [
+    "# Training GLM-4.7 on GSM8K\n",
+    "\n",
+    "This tutorial trains [GLM-4.7](https://huggingface.co/zai-org/GLM-4.7),\n",
+    "a 355B-parameter Mixture-of-Experts model (32B active per token),\n",
+    "on grade-school math problems from [GSM8K](https://huggingface.co/datasets/openai/gsm8k).\n",
+    "\n",
+    "GLM-4.7 is a large MoE model with 160 routed experts and top-8 routing.\n",
+    "The first 3 layers are dense; the remaining 89 use sparse expert routing.\n",
+    "Training a model this size requires multi-node parallelism. This tutorial covers:\n",
+    "\n",
+    "1. Serving the base model with SGLang on 8\u00d7H100.\n",
+    "2. Loading the GSM8K dataset and defining a math-answer scorer.\n",
+    "3. Evaluating the base model.\n",
+    "4. GRPO-training with [slime](https://github.com/THUDM/slime) using the built-in math reward.\n",
+    "5. Evaluating the trained checkpoint and comparing.\n",
+    "\n",
+    "**Cluster shape:** 8 nodes \u00d7 8 H100 GPUs (64 GPUs total). Training uses\n",
+    "TP=8, PP=4, CP=2, EP=16 to shard the model across all GPUs."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-001",
+   "metadata": {},
+   "source": [
+    "## Prerequisites\n",
+    "\n",
+    "This tutorial requires a Modal Secret named `huggingface-secret` containing your\n",
+    "`HF_TOKEN`. Create one at [modal.com/secrets](https://modal.com/secrets) if you\n",
+    "haven't already \u2014 the cell below fails fast with instructions otherwise."
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-002",
+   "metadata": {},
+   "source": [
+    "> **Note:** you do **not** need to attach a GPU to this notebook. All training and\n",
+    "> serving happens on Modal-managed GPU workers spun up by the SDK \u2014 the notebook\n",
+    "> itself only needs to issue API calls."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cell-003",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import modal\n",
+    "\n",
+    "try:\n",
+    "    modal.Secret.from_name(\"huggingface-secret\").hydrate()\n",
+    "except modal.exception.NotFoundError as e:\n",
+    "    raise RuntimeError(\n",
+    "        \"Missing Modal Secret 'huggingface-secret'. Create one at \"\n",
+    "        \"https://modal.com/secrets with an HF_TOKEN entry, then re-run.\"\n",
+    "    ) from e"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cell-004",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "%uv pip install -q git+https://github.com/modal-projects/training-gym.git@main"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cell-005",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "import re\n",
+    "\n",
+    "from modal_training_gym import (\n",
+    "    DeploymentConfig,\n",
+    "    EvalConfig,\n",
+    "    EvalRowResult,\n",
+    "    GLM_4_7,\n",
+    "    HuggingFaceDataset,\n",
+    "    SlimeRecipe,\n",
+    "    TrainConfig,\n",
+    "    list_checkpoints,\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-006",
+   "metadata": {},
+   "source": [
+    "## Serve the base model\n",
+    "\n",
+    "First, let's deploy GLM-4.7 so we can test it.\n",
+    "The 355B model needs 8\u00d7H100 GPUs for serving with tensor parallelism."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cell-007",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "base_model = GLM_4_7()\n",
+    "base_model_deployment = DeploymentConfig(\n",
+    "    model=base_model,\n",
+    ").serve()\n",
+    "print(f\"Base model deployed to {base_model_deployment.url}\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-008",
+   "metadata": {},
+   "source": [
+    "Let's try asking it a math question to see how it responds."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cell-009",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "response = base_model_deployment.generate(\n",
+    "    \"What is 24 * 37?\",\n",
+    "    chat_template_kwargs={\"enable_thinking\": False},\n",
+    ")\n",
+    "print(response)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-010",
+   "metadata": {},
+   "source": [
+    "## Define the GSM8K dataset\n",
+    "\n",
+    "GSM8K contains grade-school math word problems. Each row has a `question`\n",
+    "and an `answer` that ends with `#### <number>` \u2014 the final numerical answer.\n",
+    "\n",
+    "We'll use the answer extraction pattern to score the model: extract the\n",
+    "number after `####` from the label and compare it to the model's final number."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cell-011",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "class GSM8KDataset(HuggingFaceDataset):\n",
+    "    hf_repo = \"openai/gsm8k\"\n",
+    "    hf_config = \"main\"\n",
+    "    input_column = \"question\"\n",
+    "    output_column = \"answer\"\n",
+    "    output_format = \"jsonl\"\n",
+    "    apply_chat_template = True\n",
+    "    system_prompt = (\n",
+    "        \"You are a math problem solver. Solve the given math problem \"\n",
+    "        \"step by step, then give your final answer as a number on the \"\n",
+    "        \"last line prefixed with ####. For example: #### 42\"\n",
+    "    )\n",
+    "    always_prepare = True\n",
+    "\n",
+    "train_dataset = GSM8KDataset(n_rows=50)\n",
+    "eval_dataset = GSM8KDataset(n_rows=20, hf_split=\"test\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-012",
+   "metadata": {},
+   "source": [
+    "Let's look at a few examples from the dataset."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cell-013",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "df = eval_dataset.to_pandas()\n",
+    "print(f\"Eval set: {len(df)} rows\")\n",
+    "df.head(3)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-014",
+   "metadata": {},
+   "source": [
+    "## Define the scoring function\n",
+    "\n",
+    "GSM8K answers end with `#### <number>`. We extract the number from both\n",
+    "the label and the model's response, then check if they match."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cell-015",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "def extract_answer(text: str) -> str | None:\n",
+    "    \"\"\"Return the number after ``####`` (else the last number in *text*), commas removed.\"\"\"\n",
+    "    number = r\"-?\\d[\\d,]*(?:\\.\\d+)?\"  # leading digit required: a bare \",\" is not a number\n",
+    "    tagged = re.search(rf\"####\\s*({number})\", text)\n",
+    "    if tagged:\n",
+    "        return tagged.group(1).replace(\",\", \"\")\n",
+    "    found = re.findall(number, text)\n",
+    "    return found[-1].replace(\",\", \"\") if found else None\n",
+    "\n",
+    "def gsm8k_score(example: dict, response: str) -> EvalRowResult:\n",
+    "    expected = extract_answer(example.get(\"answer\", \"\"))\n",
+    "    predicted = extract_answer(response)\n",
+    "    correct = expected is not None and predicted is not None and expected == predicted\n",
+    "    return EvalRowResult(\n",
+    "        score=1.0 if correct else 0.0,\n",
+    "        response=response,\n",
+    "    )"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-016",
+   "metadata": {},
+   "source": [
+    "## Evaluate the base model"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cell-017",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "eval_config = EvalConfig(\n",
+    "    dataset=eval_dataset,\n",
+    "    eval_response_fn=gsm8k_score,\n",
+    "    generate_kwargs={\"chat_template_kwargs\": {\"enable_thinking\": False}},\n",
+    ")\n",
+    "print(\"--- Running base model evaluation... ---\")\n",
+    "base_eval = eval_config.evaluate(base_model_deployment, debug=True)\n",
+    "print(f\"Base model GSM8K accuracy: {base_eval.mean:.1%}\")\n",
+    "print(\"--- Base model evaluation complete ---\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-018",
+   "metadata": {},
+   "source": [
+    "## Train with slime\n",
+    "\n",
+    "Now let's GRPO-train GLM-4.7 on GSM8K using the built-in `math`\n",
+    "reward type. This reward extracts the final number from the model's\n",
+    "response and checks it against the label \u2014 the same logic as our eval\n",
+    "scoring function.\n",
+    "\n",
+    "Because GLM-4.7 is a 355B MoE model, the recipe uses heavy parallelism:\n",
+    "- **TP=8:** tensor parallelism across all GPUs in a node.\n",
+    "- **PP=4:** pipeline parallelism across 4 stages.\n",
+    "- **EP=16:** expert parallelism shards the 160 experts across 16 groups.\n",
+    "- **CP=2:** context parallelism for long sequences.\n",
+    "- **CPU optimizer offload:** frees GPU memory for the large parameter count."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cell-019",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "training_run = TrainConfig(\n",
+    "    model=base_model,\n",
+    "    dataset=train_dataset,\n",
+    "    recipe=SlimeRecipe(\n",
+    "        rm_type=\"math\",\n",
+    "\n",
+    "        gpu_type=\"H100\",\n",
+    "        colocate=True,\n",
+    "        actor_num_nodes=8,\n",
+    "        actor_num_gpus_per_node=8,\n",
+    "        tensor_model_parallel_size=8,\n",
+    "        sequence_parallel=True,\n",
+    "        rollout_num_gpus_per_engine=32,\n",
+    "\n",
+    "        expert_model_parallel_size=16,\n",
+    "        expert_tensor_parallel_size=1,\n",
+    "        pipeline_model_parallel_size=4,\n",
+    "        context_parallel_size=2,\n",
+    "        attention_backend=\"flash\",\n",
+    "\n",
+    "        num_rollout=10,\n",
+    "        rollout_batch_size=64,\n",
+    "        rollout_max_response_len=4096,\n",
+    "        rollout_temperature=1.0,\n",
+    "        sglang_mem_fraction_static=0.70,\n",
+    "\n",
+    "        n_samples_per_prompt=8,\n",
+    "        global_batch_size=512,\n",
+    "        max_tokens_per_gpu=16384,\n",
+    "\n",
+    "        optimizer_cpu_offload=True,\n",
+    "        overlap_cpu_optimizer_d2h_h2d=True,\n",
+    "        use_precision_aware_optimizer=True,\n",
+    "\n",
+    "        save_interval=5,\n",
+    "        apply_chat_template_kwargs='{\"enable_thinking\": false}',\n",
+    "    ),\n",
+    ")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-020",
+   "metadata": {},
+   "source": [
+    "## Launch training\n",
+    "\n",
+    "`TrainConfig.train()` builds the Modal app, launches the 64-GPU cluster\n",
+    "(8 nodes \u00d7 8 H100), runs GRPO training, and returns a `TrainResult`."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cell-021",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"--- Running training... ---\")\n",
+    "train_result = training_run.train()\n",
+    "print(\"--- Training complete ---\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-022",
+   "metadata": {},
+   "source": [
+    "## Serve and evaluate the trained checkpoint\n",
+    "\n",
+    "Let's deploy the trained checkpoint and run the same GSM8K eval."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cell-023",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "checkpoint = list_checkpoints(train_result.training_run_id)[-1]\n",
+    "print(f\"Checkpoint: {checkpoint.path}\")\n",
+    "\n",
+    "trained_model_deployment = DeploymentConfig(\n",
+    "    model=GLM_4_7(),\n",
+    "    checkpoint=checkpoint,\n",
+    "    app_name=\"glm-4.7-gsm8k-serve\",\n",
+    "    served_model_name=\"glm-4.7-gsm8k\",\n",
+    ").serve()\n",
+    "print(f\"Trained model deployed to {trained_model_deployment.url}\")"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cell-024",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(\"--- Running trained model evaluation... ---\")\n",
+    "trained_eval = eval_config.evaluate(trained_model_deployment, debug=True)\n",
+    "print(f\"Trained model GSM8K accuracy: {trained_eval.mean:.1%}\")\n",
+    "print(\"--- Trained model evaluation complete ---\")"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "id": "cell-025",
+   "metadata": {},
+   "source": [
+    "## Compare results"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "cell-026",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "print(f\"Base model GSM8K accuracy:    {base_eval.mean:.1%}\")\n",
+    "print(f\"Trained model GSM8K accuracy: {trained_eval.mean:.1%}\")\n",
+    "improvement = trained_eval.mean - base_eval.mean\n",
+    "print(f\"Improvement: {improvement:+.1%}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 3",
+   "language": "python",
+   "name": "python3"
+  },
+  "language_info": {
+   "name": "python"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.py b/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.py
new file mode 100644
index 00000000..55bf855b
--- /dev/null
+++ b/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.py
@@ -0,0 +1,223 @@
+# Generated by generate_tutorial.py — do not edit directly.
+# Source: tutorials/tutorial_generator/rl/003_glm_gsm8k.py
+
+# # Training GLM-4.7 on GSM8K
+#
+# This tutorial trains [GLM-4.7](https://huggingface.co/zai-org/GLM-4.7),
+# a 355B-parameter Mixture-of-Experts model (32B active per token),
+# on grade-school math problems from [GSM8K](https://huggingface.co/datasets/openai/gsm8k).
+#
+# GLM-4.7 is a large MoE model with 160 routed experts and top-8 routing.
+# The first 3 layers are dense; the remaining 89 use sparse expert routing.
+# Training a model this size requires multi-node parallelism. This tutorial covers:
+#
+# 1. Serving the base model with SGLang on 8×H100.
+# 2. Loading the GSM8K dataset and defining a math-answer scorer.
+# 3. Evaluating the base model.
+# 4. GRPO-training with [slime](https://github.com/THUDM/slime) using the built-in math reward.
+# 5. Evaluating the trained checkpoint and comparing.
+#
+# **Cluster shape:** 8 nodes × 8 H100 GPUs (64 GPUs total). Training uses
+# TP=8, PP=4, CP=2, EP=16 to shard the model across all GPUs.
+# To run the tutorial, run the following command:
+# ```
+# uv run python tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.py
+# ```
+# ## Prerequisites
+#
+# This tutorial requires a Modal Secret named `huggingface-secret` containing your
+# `HF_TOKEN`. Create one at [modal.com/secrets](https://modal.com/secrets) if you
+# haven't already — the cell below fails fast with instructions otherwise.
+
+import modal
+
+import re
+
+from modal_training_gym import (
+    DeploymentConfig,
+    EvalConfig,
+    EvalRowResult,
+    GLM_4_7,
+    HuggingFaceDataset,
+    SlimeRecipe,
+    TrainConfig,
+    list_checkpoints,
+)
+
+# ## Define the GSM8K dataset
+#
+# GSM8K contains grade-school math word problems. Each row has a `question`
+# and an `answer` that ends with `#### <number>` — the final numerical answer.
+#
+# We'll use the answer extraction pattern to score the model: extract the
+# number after `####` from the label and compare it to the model's final number.
+
+class GSM8KDataset(HuggingFaceDataset):
+    hf_repo = "openai/gsm8k"
+    hf_config = "main"
+    input_column = "question"
+    output_column = "answer"
+    output_format = "jsonl"
+    apply_chat_template = True
+    system_prompt = (
+        "You are a math problem solver. Solve the given math problem "
+        "step by step, then give your final answer as a number on the "
+        "last line prefixed with ####. For example: #### 42"
+    )
+    always_prepare = True
+
+# ## Define the scoring function
+#
+# GSM8K answers end with `#### <number>`. We extract the number from both
+# the label and the model's response, then check if they match.
+
+def extract_answer(text: str) -> str | None:
+    """Return the number after ``####``, else the last number in *text*, else None."""
+    marked = re.search(r"####\s*(-?\d[\d,]*(?:\.\d+)?)", text)
+    if marked:
+        return marked.group(1).replace(",", "")
+    numbers = re.findall(r"-?\d[\d,]*(?:\.\d+)?", text)
+    # Each match starts with a digit, so a bare punctuation comma is never an "answer".
+    return numbers[-1].replace(",", "") if numbers else None
+
+def gsm8k_score(example: dict, response: str) -> EvalRowResult:
+    expected = extract_answer(example.get("answer", ""))
+    predicted = extract_answer(response)
+    correct = expected is not None and predicted is not None and expected == predicted
+    return EvalRowResult(
+        score=1.0 if correct else 0.0,
+        response=response,
+    )
+
+import modal
+
+tutorial_cli_app = modal.App()
+
+def _main_impl() -> None:
+    try:
+        modal.Secret.from_name("huggingface-secret").hydrate()
+    except modal.exception.NotFoundError as e:
+        raise RuntimeError(
+            "Missing Modal Secret 'huggingface-secret'. Create one at "
+            "https://modal.com/secrets with an HF_TOKEN entry, then re-run."
+        ) from e
+
+    # ## Serve the base model
+    #
+    # First, let's deploy GLM-4.7 so we can test it.
+    # The 355B model needs 8×H100 GPUs for serving with tensor parallelism.
+
+    base_model = GLM_4_7()
+    base_model_deployment = DeploymentConfig(
+        model=base_model,
+    ).serve()
+    print(f"Base model deployed to {base_model_deployment.url}")
+
+    train_dataset = GSM8KDataset(n_rows=50)
+    eval_dataset = GSM8KDataset(n_rows=20, hf_split="test")
+
+    eval_config = EvalConfig(
+        dataset=eval_dataset,
+        eval_response_fn=gsm8k_score,
+        generate_kwargs={"chat_template_kwargs": {"enable_thinking": False}},
+    )
+    print("--- Running base model evaluation... ---")
+    base_eval = eval_config.evaluate(base_model_deployment, debug=True)
+    print(f"Base model GSM8K accuracy: {base_eval.mean:.1%}")
+    print("--- Base model evaluation complete ---")
+
+    # ## Train with slime
+    #
+    # Now let's GRPO-train GLM-4.7 on GSM8K using the built-in `math`
+    # reward type. This reward extracts the final number from the model's
+    # response and checks it against the label — the same logic as our eval
+    # scoring function.
+    #
+    # Because GLM-4.7 is a 355B MoE model, the recipe uses heavy parallelism:
+    # - **TP=8:** tensor parallelism across all GPUs in a node.
+    # - **PP=4:** pipeline parallelism across 4 stages.
+    # - **EP=16:** expert parallelism shards the 160 experts across 16 groups.
+    # - **CP=2:** context parallelism for long sequences.
+    # - **CPU optimizer offload:** frees GPU memory for the large parameter count.
+
+    training_run = TrainConfig(
+        model=base_model,
+        dataset=train_dataset,
+        recipe=SlimeRecipe(
+            rm_type="math",
+
+            gpu_type="H100",
+            colocate=True,
+            actor_num_nodes=8,
+            actor_num_gpus_per_node=8,
+            tensor_model_parallel_size=8,
+            sequence_parallel=True,
+            rollout_num_gpus_per_engine=32,
+
+            expert_model_parallel_size=16,
+            expert_tensor_parallel_size=1,
+            pipeline_model_parallel_size=4,
+            context_parallel_size=2,
+            attention_backend="flash",
+
+            num_rollout=10,
+            rollout_batch_size=64,
+            rollout_max_response_len=4096,
+            rollout_temperature=1.0,
+            sglang_mem_fraction_static=0.70,
+
+            n_samples_per_prompt=8,
+            global_batch_size=512,
+            max_tokens_per_gpu=16384,
+
+            optimizer_cpu_offload=True,
+            overlap_cpu_optimizer_d2h_h2d=True,
+            use_precision_aware_optimizer=True,
+
+            save_interval=5,
+            apply_chat_template_kwargs='{"enable_thinking": false}',
+        ),
+    )
+
+    # ## Launch training
+    #
+    # `TrainConfig.train()` builds the Modal app, launches the 64-GPU cluster
+    # (8 nodes × 8 H100), runs GRPO training, and returns a `TrainResult`.
+
+    print("--- Running training... ---")
+    train_result = training_run.train()
+    print("--- Training complete ---")
+
+    # ## Serve and evaluate the trained checkpoint
+    #
+    # Let's deploy the trained checkpoint and run the same GSM8K eval.
+
+    checkpoint = list_checkpoints(train_result.training_run_id)[-1]
+    print(f"Checkpoint: {checkpoint.path}")
+
+    trained_model_deployment = DeploymentConfig(
+        model=GLM_4_7(),
+        checkpoint=checkpoint,
+        app_name="glm-4.7-gsm8k-serve",
+        served_model_name="glm-4.7-gsm8k",
+    ).serve()
+    print(f"Trained model deployed to {trained_model_deployment.url}")
+
+    print("--- Running trained model evaluation... ---")
+    trained_eval = eval_config.evaluate(trained_model_deployment, debug=True)
+    print(f"Trained model GSM8K accuracy: {trained_eval.mean:.1%}")
+    print("--- Trained model evaluation complete ---")
+
+    # ## Compare results
+
+    print(f"Base model GSM8K accuracy:    {base_eval.mean:.1%}")
+    print(f"Trained model GSM8K accuracy: {trained_eval.mean:.1%}")
+    improvement = trained_eval.mean - base_eval.mean
+    print(f"Improvement: {improvement:+.1%}")
+
+@tutorial_cli_app.local_entrypoint()
+def main() -> None:
+    _main_impl()
+
+if __name__ == "__main__":
+    main()
diff --git a/tutorials/tutorial_generator/rl/003_glm_gsm8k.py b/tutorials/tutorial_generator/rl/003_glm_gsm8k.py
new file mode 100644
index 00000000..c69081cf
--- /dev/null
+++ b/tutorials/tutorial_generator/rl/003_glm_gsm8k.py
@@ -0,0 +1,340 @@
+"""Tutorial source for `003_glm_gsm8k` — parsed by generate_tutorial.py."""
+
+TUTORIAL_METADATA = {
+    "framework": "`slime`",
+    "cluster_shape": "8 × 8×H100",
+    "summary": "GLM-4.7 (355B MoE) on GSM8K math — serve, evaluate, GRPO-train, compare",
+    "difficulty": "Advanced",
+    "order": 30,
+    "api_classes": [
+        "GLM_4_7",
+        "DeploymentConfig",
+        "EvalConfig",
+        "EvalRowResult",
+        "TrainConfig",
+        "SlimeRecipe",
+        "TrainResult",
+    ],
+}
+
+
+from tutorial_generator import code, markdown, notebook_only, py_only, shell
+
+
+@markdown
+def _intro():
+    """
+    # Training GLM-4.7 on GSM8K
+
+    This tutorial trains [GLM-4.7](https://huggingface.co/zai-org/GLM-4.7),
+    a 355B-parameter Mixture-of-Experts model (32B active per token),
+    on grade-school math problems from [GSM8K](https://huggingface.co/datasets/openai/gsm8k).
+
+    GLM-4.7 is a large MoE model with 160 routed experts and top-8 routing.
+    The first 3 layers are dense; the remaining 89 use sparse expert routing.
+    Training a model this size requires multi-node parallelism. This tutorial covers:
+
+    1. Serving the base model with SGLang on 8×H100.
+    2. Loading the GSM8K dataset and defining a math-answer scorer.
+    3. Evaluating the base model.
+    4. GRPO-training with [slime](https://github.com/THUDM/slime) using the built-in math reward.
+    5. Evaluating the trained checkpoint and comparing.
+
+    **Cluster shape:** 8 nodes × 8 H100 GPUs (64 GPUs total). Training uses
+    TP=8, PP=4, CP=2, EP=16 to shard the model across all GPUs.
+    """
+
+
+@py_only
+@markdown
+def run_instructions():
+    """
+    To run the tutorial, run the following command:
+    ```
+    uv run python tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.py
+    ```
+    """
+
+
+@notebook_only
+@shell("%uv pip install -q git+https://github.com/modal-projects/training-gym.git@main")
+def _install():
+    pass
+
+
+@code
+def _imports():
+    import re
+
+    from modal_training_gym import (
+        DeploymentConfig,
+        EvalConfig,
+        EvalRowResult,
+        GLM_4_7,
+        HuggingFaceDataset,
+        SlimeRecipe,
+        TrainConfig,
+        list_checkpoints,
+    )
+
+
+@markdown
+def _serve_base_intro():
+    """
+    ## Serve the base model
+
+    First, let's deploy GLM-4.7 so we can test it.
+    The 355B model needs 8×H100 GPUs for serving with tensor parallelism.
+    """
+
+
+@code
+def _serve_base_model():
+    base_model = GLM_4_7()
+    base_model_deployment = DeploymentConfig(
+        model=base_model,
+    ).serve()
+    print(f"Base model deployed to {base_model_deployment.url}")
+
+
+@notebook_only
+@markdown
+def _try_base_model():
+    """
+    Let's try asking it a math question to see how it responds.
+    """
+
+
+@notebook_only
+@code
+def _try_base_model_code():
+    response = base_model_deployment.generate(
+        "What is 24 * 37?",
+        chat_template_kwargs={"enable_thinking": False},
+    )
+    print(response)
+
+
+@markdown
+def _dataset_intro():
+    """
+    ## Define the GSM8K dataset
+
+    GSM8K contains grade-school math word problems. Each row has a `question`
+    and an `answer` that ends with `#### <number>` — the final numerical answer.
+
+    We'll use the answer extraction pattern to score the model: extract the
+    number after `####` from the label and compare it to the model's final number.
+    """
+
+
+@code
+def _define_dataset():
+    class GSM8KDataset(HuggingFaceDataset):
+        hf_repo = "openai/gsm8k"
+        hf_config = "main"
+        input_column = "question"
+        output_column = "answer"
+        output_format = "jsonl"
+        apply_chat_template = True
+        system_prompt = (
+            "You are a math problem solver. Solve the given math problem "
+            "step by step, then give your final answer as a number on the "
+            "last line prefixed with ####. For example: #### 42"
+        )
+        always_prepare = True
+
+    train_dataset = GSM8KDataset(n_rows=50)
+    eval_dataset = GSM8KDataset(n_rows=20, hf_split="test")
+
+
+@notebook_only
+@markdown
+def _peek_dataset():
+    """
+    Let's look at a few examples from the dataset.
+    """
+
+
+@notebook_only
+@code
+def _peek_dataset_code():
+    df = eval_dataset.to_pandas()
+    print(f"Eval set: {len(df)} rows")
+    df.head(3)
+
+
+@markdown
+def _scoring_intro():
+    """
+    ## Define the scoring function
+
+    GSM8K answers end with `#### <number>`. We extract the number from both
+    the label and the model's response, then check if they match.
+    """
+
+
+@code
+def _define_scoring():
+    def extract_answer(text: str) -> str | None:
+        """Return the number after ``####``, else the last number in *text*, else None."""
+        marked = re.search(r"####\s*(-?\d[\d,]*(?:\.\d+)?)", text)
+        if marked:
+            return marked.group(1).replace(",", "")
+        numbers = re.findall(r"-?\d[\d,]*(?:\.\d+)?", text)
+        # Each match starts with a digit, so a bare punctuation comma is never an "answer".
+        return numbers[-1].replace(",", "") if numbers else None
+
+    def gsm8k_score(example: dict, response: str) -> EvalRowResult:
+        expected = extract_answer(example.get("answer", ""))
+        predicted = extract_answer(response)
+        correct = expected is not None and predicted is not None and expected == predicted
+        return EvalRowResult(
+            score=1.0 if correct else 0.0,
+            response=response,
+        )
+
+
+@notebook_only
+@markdown
+def _eval_base_intro():
+    """
+    ## Evaluate the base model
+    """
+
+
+@code
+def _eval_base():
+    eval_config = EvalConfig(
+        dataset=eval_dataset,
+        eval_response_fn=gsm8k_score,
+        generate_kwargs={"chat_template_kwargs": {"enable_thinking": False}},
+    )
+    print("--- Running base model evaluation... ---")
+    base_eval = eval_config.evaluate(base_model_deployment, debug=True)
+    print(f"Base model GSM8K accuracy: {base_eval.mean:.1%}")
+    print("--- Base model evaluation complete ---")
+
+
+@markdown
+def _train_intro():
+    """
+    ## Train with slime
+
+    Now let's GRPO-train GLM-4.7 on GSM8K using the built-in `math`
+    reward type. This reward extracts the final number from the model's
+    response and checks it against the label — the same logic as our eval
+    scoring function.
+
+    Because GLM-4.7 is a 355B MoE model, the recipe uses heavy parallelism:
+    - **TP=8:** tensor parallelism across all GPUs in a node.
+    - **PP=4:** pipeline parallelism across 4 stages.
+    - **EP=16:** expert parallelism shards the 160 experts across 16 groups.
+    - **CP=2:** context parallelism for long sequences.
+    - **CPU optimizer offload:** frees GPU memory for the large parameter count.
+    """
+
+
+@code
+def _define_training():
+    training_run = TrainConfig(
+        model=base_model,
+        dataset=train_dataset,
+        recipe=SlimeRecipe(
+            rm_type="math",
+
+            gpu_type="H100",
+            colocate=True,
+            actor_num_nodes=8,
+            actor_num_gpus_per_node=8,
+            tensor_model_parallel_size=8,
+            sequence_parallel=True,
+            rollout_num_gpus_per_engine=32,
+
+            expert_model_parallel_size=16,
+            expert_tensor_parallel_size=1,
+            pipeline_model_parallel_size=4,
+            context_parallel_size=2,
+            attention_backend="flash",
+
+            num_rollout=10,
+            rollout_batch_size=64,
+            rollout_max_response_len=4096,
+            rollout_temperature=1.0,
+            sglang_mem_fraction_static=0.70,
+
+            n_samples_per_prompt=8,
+            global_batch_size=512,
+            max_tokens_per_gpu=16384,
+
+            optimizer_cpu_offload=True,
+            overlap_cpu_optimizer_d2h_h2d=True,
+            use_precision_aware_optimizer=True,
+
+            save_interval=5,
+            apply_chat_template_kwargs='{"enable_thinking": false}',
+        ),
+    )
+
+
+@markdown
+def _launch_training():
+    """
+    ## Launch training
+
+    `TrainConfig.train()` builds the Modal app, launches the 64-GPU cluster
+    (8 nodes × 8 H100), runs GRPO training, and returns a `TrainResult`.
+    """
+
+
+@code
+def _run_training():
+    print("--- Running training... ---")
+    train_result = training_run.train()
+    print("--- Training complete ---")
+
+
+@markdown
+def _eval_trained_intro():
+    """
+    ## Serve and evaluate the trained checkpoint
+
+    Let's deploy the trained checkpoint and run the same GSM8K eval.
+    """
+
+
+@code
+def _serve_trained():
+    checkpoint = list_checkpoints(train_result.training_run_id)[-1]
+    print(f"Checkpoint: {checkpoint.path}")
+
+    trained_model_deployment = DeploymentConfig(
+        model=GLM_4_7(),
+        checkpoint=checkpoint,
+        app_name="glm-4.7-gsm8k-serve",
+        served_model_name="glm-4.7-gsm8k",
+    ).serve()
+    print(f"Trained model deployed to {trained_model_deployment.url}")
+
+
+@code
+def _eval_trained():
+    print("--- Running trained model evaluation... ---")
+    trained_eval = eval_config.evaluate(trained_model_deployment, debug=True)
+    print(f"Trained model GSM8K accuracy: {trained_eval.mean:.1%}")
+    print("--- Trained model evaluation complete ---")
+
+
+@markdown
+def _compare_intro():
+    """
+    ## Compare results
+    """
+
+
+@code
+def _compare():
+    print(f"Base model GSM8K accuracy:    {base_eval.mean:.1%}")
+    print(f"Trained model GSM8K accuracy: {trained_eval.mean:.1%}")
+    improvement = trained_eval.mean - base_eval.mean
+    print(f"Improvement: {improvement:+.1%}")