diff --git a/README.md b/README.md index 94704cc2..79c5214a 100644 --- a/README.md +++ b/README.md @@ -70,9 +70,10 @@ rest of the cells run as-is. | Tutorial | Summary | Difficulty | Framework | Launch | |---|---|---|---|---| -| [`000_rl_basics`](tutorials/rl/000_rl_basics/000_rl_basics.ipynb) | Qwen3-4B haiku evaluation with verifiable rewards — serve, evaluate, train, compare | Beginner | `slime` | Open in Modal | -| [`001_sandboxes`](tutorials/rl/001_sandboxes/001_sandboxes.ipynb) | Code RL with Harbor hello-world and sandboxed verification | Intermediate | `slime` | Open in Modal | -| [`002_multiturn`](tutorials/rl/002_multiturn/002_multiturn.ipynb) | Multi-turn number-guessing RL with custom generate and reward functions | Intermediate | `slime` | Open in Modal | +| [`000_rl_basics`](tutorials/rl/000_rl_basics/000_rl_basics.ipynb) | Qwen3-4B haiku evaluation with verifiable rewards — serve, evaluate, train, compare | Beginner | `slime` | Open in Modal | +| [`001_sandboxes`](tutorials/rl/001_sandboxes/001_sandboxes.ipynb) | Code RL with Harbor hello-world and sandboxed verification | Intermediate | `slime` | Open in Modal | +| [`002_multiturn`](tutorials/rl/002_multiturn/002_multiturn.ipynb) | Multi-turn number-guessing RL with custom generate and reward functions | Intermediate | `slime` | Open in Modal | +| [`003_glm_gsm8k`](tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.ipynb) | GLM-4.7 (355B MoE) on GSM8K math — serve, evaluate, GRPO-train, compare | Advanced | `slime` | Open in Modal | See [`tutorials/README.md`](tutorials/README.md) for how to run the `.py` diff --git a/modal_training_gym/__init__.py b/modal_training_gym/__init__.py index a5e5ab6f..273084f8 100644 --- a/modal_training_gym/__init__.py +++ b/modal_training_gym/__init__.py @@ -10,6 +10,8 @@ "EvalConfigDurable": ("modal_training_gym.common.eval", "EvalConfigDurable"), "EvalResult": ("modal_training_gym.common.eval", "EvalResult"), "EvalRowResult": ("modal_training_gym.common.eval", 
"EvalRowResult"), + "GLM_4_7": ("modal_training_gym.common.models", "GLM_4_7"), + "GLM_4_7_Flash": ("modal_training_gym.common.models", "GLM_4_7_Flash"), "HFModelConfiguration": ( "modal_training_gym.common.models", "HFModelConfiguration", @@ -60,6 +62,8 @@ "ModelConfig", "ModelDeployment", "MultiTurn", + "GLM_4_7", + "GLM_4_7_Flash", "Qwen3_0_6B", "Qwen3_1_7B", "Qwen3_4B", diff --git a/modal_training_gym/common/models/__init__.py b/modal_training_gym/common/models/__init__.py index f2401e3a..8316d4ef 100644 --- a/modal_training_gym/common/models/__init__.py +++ b/modal_training_gym/common/models/__init__.py @@ -3,6 +3,8 @@ ModelArchitecture, ModelConfig, ) +from .glm_4_7 import GLM_4_7 +from .glm_4_7_flash import GLM_4_7_Flash from .qwen3_0_6b import Qwen3_0_6B from .qwen3_1_7b import Qwen3_1_7B from .qwen3_4b import Qwen3_4B @@ -12,6 +14,8 @@ from .qwen3_32b import Qwen3_32B __all__ = [ + "GLM_4_7", + "GLM_4_7_Flash", "HFModelConfiguration", "ModelArchitecture", "ModelConfig", diff --git a/modal_training_gym/common/models/base.py b/modal_training_gym/common/models/base.py index f8123bcc..5cafad13 100644 --- a/modal_training_gym/common/models/base.py +++ b/modal_training_gym/common/models/base.py @@ -75,6 +75,13 @@ class ModelArchitecture: use_rotary_position_embeddings: bool = True rotary_base: int = 10000 + # MoE (Mixture of Experts) + num_experts: int = 0 + moe_router_topk: int = 0 + moe_ffn_hidden_size: int = 0 + num_shared_experts: int = 0 + first_k_dense_replace: int = 0 + def to_megatron_args(self) -> list[str]: """Generate Megatron-LM CLI flags from this architecture spec.""" args: list[str] = [] @@ -111,6 +118,16 @@ def to_megatron_args(self) -> list[str]: args += ["--position-embedding-type", "rope"] if self.rotary_base != 10000: args += ["--rotary-base", str(self.rotary_base)] + if self.num_experts: + args += ["--num-experts", str(self.num_experts)] + if self.moe_router_topk: + args += ["--moe-router-topk", str(self.moe_router_topk)] + if 
self.moe_ffn_hidden_size: + args += ["--moe-ffn-hidden-size", str(self.moe_ffn_hidden_size)] + if self.num_shared_experts: + args += ["--num-shared-experts", str(self.num_shared_experts)] + if self.first_k_dense_replace: + args += ["--first-k-dense-replace", str(self.first_k_dense_replace)] return args diff --git a/modal_training_gym/common/models/glm_4_7.py b/modal_training_gym/common/models/glm_4_7.py new file mode 100644 index 00000000..8e375a5e --- /dev/null +++ b/modal_training_gym/common/models/glm_4_7.py @@ -0,0 +1,38 @@ +"""GLM-4.7 (355B-A32B MoE) model spec.""" + +from .base import HFModelConfiguration, ModelArchitecture + + +class GLM_4_7(HFModelConfiguration): + """GLM-4.7 (355B total, 32B active) MoE from Zhipu AI. + + 160 routed experts with top-8 routing plus 1 shared expert. + First 3 layers are dense; remaining 89 are MoE. + Uses GQA (96 Q heads, 8 KV heads) with partial RoPE. + Downloads from ``zai-org/GLM-4.7`` on HuggingFace. + """ + + model_name = "zai-org/GLM-4.7" + architecture = ModelArchitecture( + num_layers=92, + hidden_size=5120, + ffn_hidden_size=12288, + num_attention_heads=96, + group_query_attention=True, + num_query_groups=8, + kv_channels=128, + vocab_size=151552, + normalization="RMSNorm", + norm_epsilon=1e-5, + swiglu=True, + disable_bias_linear=False, + qk_layernorm=True, + untie_embeddings_and_output_weights=True, + use_rotary_position_embeddings=True, + rotary_base=1000000, + num_experts=160, + moe_router_topk=8, + moe_ffn_hidden_size=1536, + num_shared_experts=1, + first_k_dense_replace=3, + ) diff --git a/modal_training_gym/common/models/glm_4_7_flash.py b/modal_training_gym/common/models/glm_4_7_flash.py new file mode 100644 index 00000000..6bb84538 --- /dev/null +++ b/modal_training_gym/common/models/glm_4_7_flash.py @@ -0,0 +1,36 @@ +"""GLM-4.7-Flash (30B-A3B MoE) model spec.""" + +from .base import HFModelConfiguration, ModelArchitecture + + +class GLM_4_7_Flash(HFModelConfiguration): + """GLM-4.7-Flash (30B total, 3B 
active) MoE from Zhipu AI. + + 64 routed experts with top-4 routing plus 1 shared expert. + Uses Multi-head Latent Attention (MLA) and multi-token prediction. + Downloads from ``zai-org/GLM-4.7-Flash`` on HuggingFace. + """ + + model_name = "zai-org/GLM-4.7-Flash" + architecture = ModelArchitecture( + num_layers=47, + hidden_size=2048, + ffn_hidden_size=10240, + num_attention_heads=20, + group_query_attention=False, + num_query_groups=20, + kv_channels=128, + vocab_size=154880, + normalization="RMSNorm", + norm_epsilon=1e-5, + swiglu=True, + disable_bias_linear=True, + qk_layernorm=True, + use_rotary_position_embeddings=True, + rotary_base=1000000, + num_experts=64, + moe_router_topk=4, + moe_ffn_hidden_size=1536, + num_shared_experts=1, + first_k_dense_replace=1, + ) diff --git a/modal_training_gym/deploy_recipes/sglang_recipe/__init__.py b/modal_training_gym/deploy_recipes/sglang_recipe/__init__.py index 173da428..faae4f75 100644 --- a/modal_training_gym/deploy_recipes/sglang_recipe/__init__.py +++ b/modal_training_gym/deploy_recipes/sglang_recipe/__init__.py @@ -1,4 +1,10 @@ from modal_training_gym.deploy_recipes.sglang_recipe.recipe import SglangRecipe +from modal_training_gym.deploy_recipes.sglang_recipe.glm_4_7 import ( + GLM_4_7_SglangRecipe, +) +from modal_training_gym.deploy_recipes.sglang_recipe.glm_4_7_flash import ( + GLM_4_7_Flash_SglangRecipe, +) from modal_training_gym.deploy_recipes.sglang_recipe.qwen3_0_6b import ( Qwen3_0_6b_SglangRecipe, ) @@ -22,6 +28,8 @@ ) __all__ = [ + "GLM_4_7_SglangRecipe", + "GLM_4_7_Flash_SglangRecipe", "SglangRecipe", "Qwen3_0_6b_SglangRecipe", "Qwen3_1_7b_SglangRecipe", diff --git a/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7.py b/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7.py new file mode 100644 index 00000000..84383006 --- /dev/null +++ b/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7.py @@ -0,0 +1,26 @@ +from dataclasses import dataclass + +from 
modal_training_gym.deploy_recipes.sglang_recipe.recipe import SglangRecipe + +_GLM_4_7_DEFAULTS = { + "gpu": "H100", + "tp": 8, + "context_length": 32768, + "mem_fraction_static": 0.80, + "chunked_prefill_size": 8192, + "max_running_requests": 16, + "extra_server_args": {"--trust-remote-code": ""}, +} + + +_SGLANG_DEFAULTS = SglangRecipe() + + +@dataclass +class GLM_4_7_SglangRecipe(SglangRecipe): + """GLM-4.7 (355B) on 8×H100 — tensor-parallel MoE serving.""" + + def __post_init__(self) -> None: # fill fields still at the base default; containers are copied so instances never alias the module-level defaults + for key, val in _GLM_4_7_DEFAULTS.items(): + if getattr(self, key) == getattr(_SGLANG_DEFAULTS, key): + object.__setattr__(self, key, val.copy() if isinstance(val, (dict, list)) else val) diff --git a/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7_flash.py b/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7_flash.py new file mode 100644 index 00000000..d248b63a --- /dev/null +++ b/modal_training_gym/deploy_recipes/sglang_recipe/glm_4_7_flash.py @@ -0,0 +1,27 @@ +from dataclasses import dataclass + +from modal_training_gym.deploy_recipes.sglang_recipe.recipe import SglangRecipe + +_GLM_4_7_FLASH_DEFAULTS = { + "gpu": "H100", + "tp": 1, + "dp": 8, + "context_length": 32768, + "mem_fraction_static": 0.80, + "chunked_prefill_size": 8192, + "max_running_requests": 16, + "extra_server_args": {"--trust-remote-code": ""}, +} + + +_SGLANG_DEFAULTS = SglangRecipe() + + +@dataclass +class GLM_4_7_Flash_SglangRecipe(SglangRecipe): + """GLM-4.7-Flash on 8×H100 — DP-attention MoE serving.""" + + def __post_init__(self) -> None: # fill fields still at the base default; containers are copied so instances never alias the module-level defaults + for key, val in _GLM_4_7_FLASH_DEFAULTS.items(): + if getattr(self, key) == getattr(_SGLANG_DEFAULTS, key): + object.__setattr__(self, key, val.copy() if isinstance(val, (dict, list)) else val) diff --git a/modal_training_gym/deploy_recipes/vllm_recipe/__init__.py b/modal_training_gym/deploy_recipes/vllm_recipe/__init__.py index cdcb8a99..34091d67 100644 --- a/modal_training_gym/deploy_recipes/vllm_recipe/__init__.py +++ b/modal_training_gym/deploy_recipes/vllm_recipe/__init__.py @@ -1,4 +1,10 @@ from 
modal_training_gym.deploy_recipes.vllm_recipe.recipe import VllmRecipe +from modal_training_gym.deploy_recipes.vllm_recipe.glm_4_7 import ( + GLM_4_7_VllmRecipe, +) +from modal_training_gym.deploy_recipes.vllm_recipe.glm_4_7_flash import ( + GLM_4_7_Flash_VllmRecipe, +) from modal_training_gym.deploy_recipes.vllm_recipe.qwen3_0_6b import ( Qwen3_0_6b_VllmRecipe, ) @@ -12,6 +18,8 @@ from modal_training_gym.deploy_recipes.vllm_recipe.qwen3_32b import Qwen3_32b_VllmRecipe __all__ = [ + "GLM_4_7_VllmRecipe", + "GLM_4_7_Flash_VllmRecipe", "VllmRecipe", "Qwen3_0_6b_VllmRecipe", "Qwen3_1_7b_VllmRecipe", diff --git a/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7.py b/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7.py new file mode 100644 index 00000000..09010cf5 --- /dev/null +++ b/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass + +from modal_training_gym.deploy_recipes.vllm_recipe.recipe import VllmRecipe + +_GLM_4_7_DEFAULTS = { + "gpu": "H100", + "n_gpu": 8, + "extra_vllm_args": ["--trust-remote-code"], +} + +_VLLM_DEFAULTS = VllmRecipe() + + +@dataclass +class GLM_4_7_VllmRecipe(VllmRecipe): + """GLM-4.7 (355B) on 8×H100 — tensor-parallel MoE serving.""" + + def __post_init__(self) -> None: # fill fields still at the base default; containers are copied so instances never alias the module-level defaults + for key, val in _GLM_4_7_DEFAULTS.items(): + if getattr(self, key) == getattr(_VLLM_DEFAULTS, key): + object.__setattr__(self, key, val.copy() if isinstance(val, (dict, list)) else val) diff --git a/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7_flash.py b/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7_flash.py new file mode 100644 index 00000000..50cc52c4 --- /dev/null +++ b/modal_training_gym/deploy_recipes/vllm_recipe/glm_4_7_flash.py @@ -0,0 +1,21 @@ +from dataclasses import dataclass + +from modal_training_gym.deploy_recipes.vllm_recipe.recipe import VllmRecipe + +_GLM_4_7_FLASH_DEFAULTS = { + "gpu": "H100", + "n_gpu": 2, + "extra_vllm_args": ["--trust-remote-code"], +} + +_VLLM_DEFAULTS = VllmRecipe() + + +@dataclass +class 
GLM_4_7_Flash_VllmRecipe(VllmRecipe): + """GLM-4.7-Flash on 2×H100 — tensor-parallel MoE serving.""" + + def __post_init__(self) -> None: # fill fields still at the base default; containers are copied so instances never alias the module-level defaults + for key, val in _GLM_4_7_FLASH_DEFAULTS.items(): + if getattr(self, key) == getattr(_VLLM_DEFAULTS, key): + object.__setattr__(self, key, val.copy() if isinstance(val, (dict, list)) else val) diff --git a/modal_training_gym/train_recipes/slime_recipe/__init__.py b/modal_training_gym/train_recipes/slime_recipe/__init__.py index 2c8e5635..7487668b 100644 --- a/modal_training_gym/train_recipes/slime_recipe/__init__.py +++ b/modal_training_gym/train_recipes/slime_recipe/__init__.py @@ -3,6 +3,10 @@ SlimeRecipeBlock, ) from modal_training_gym.train_recipes.slime_recipe.recipe import SlimeRecipe +from modal_training_gym.train_recipes.slime_recipe.glm_4_7 import GLM_4_7_Recipe +from modal_training_gym.train_recipes.slime_recipe.glm_4_7_flash import ( + GLM_4_7_Flash_Recipe, +) from modal_training_gym.train_recipes.slime_recipe.qwen3_1_7b import Qwen3_1_7b_Recipe from modal_training_gym.train_recipes.slime_recipe.qwen3_8b import Qwen3_8b_Recipe from modal_training_gym.train_recipes.slime_recipe.qwen3_14b import Qwen3_14b_Recipe @@ -10,6 +14,8 @@ from modal_training_gym.train_recipes.slime_recipe.qwen3_4b import Qwen3_4b_Recipe __all__ = [ + "GLM_4_7_Recipe", + "GLM_4_7_Flash_Recipe", "MultiTurn", "SlimeRecipe", "SlimeRecipeBlock", diff --git a/modal_training_gym/train_recipes/slime_recipe/glm_4_7.py b/modal_training_gym/train_recipes/slime_recipe/glm_4_7.py new file mode 100644 index 00000000..2de0060a --- /dev/null +++ b/modal_training_gym/train_recipes/slime_recipe/glm_4_7.py @@ -0,0 +1,51 @@ +from pydantic import ConfigDict +from pydantic.dataclasses import dataclass + +from modal_training_gym.train_recipes.slime_recipe.recipe import SlimeRecipe + + +@dataclass(config=ConfigDict(extra="forbid", arbitrary_types_allowed=True)) +class GLM_4_7_Recipe(SlimeRecipe): + """GLM-4.7 (355B-A32B MoE) on 8×8×H100, colocated GRPO. + + TP=8, PP=4, CP=2, EP=16 across 8 nodes (64 GPUs). 
+ Uses CPU optimizer offloading for the large parameter count. + """ + + gpu_type: str = "H100" + colocate: bool = True + actor_num_nodes: int = 8 + actor_num_gpus_per_node: int = 8 + tensor_model_parallel_size: int = 8 + sequence_parallel: bool = True + rollout_num_gpus_per_engine: int = 32 + + # MoE parallelism + expert_model_parallel_size: int = 16 + expert_tensor_parallel_size: int = 1 + pipeline_model_parallel_size: int = 4 + context_parallel_size: int = 2 + attention_backend: str | None = "flash" + + # Rollout + num_rollout: int = 1 + rollout_batch_size: int = 64 + rollout_max_response_len: int = 4096 + rollout_temperature: float = 1.0 + sglang_mem_fraction_static: float = 0.70 + + save_interval: int = 10 + + # Training + n_samples_per_prompt: int = 8 + global_batch_size: int = 512 + lr: float = 1e-6 + max_tokens_per_gpu: int = 16384 + + # Optimizer offloading (required for 355B model) + optimizer_cpu_offload: bool = True + overlap_cpu_optimizer_d2h_h2d: bool = True + use_precision_aware_optimizer: bool = True + + eval_interval: int | None = 10 + eval_max_response_len: int = 4096 diff --git a/modal_training_gym/train_recipes/slime_recipe/glm_4_7_flash.py b/modal_training_gym/train_recipes/slime_recipe/glm_4_7_flash.py new file mode 100644 index 00000000..90ee6745 --- /dev/null +++ b/modal_training_gym/train_recipes/slime_recipe/glm_4_7_flash.py @@ -0,0 +1,56 @@ +from pydantic import ConfigDict +from pydantic.dataclasses import dataclass + +from modal_training_gym.train_recipes.slime_recipe.recipe import SlimeRecipe + + +@dataclass(config=ConfigDict(extra="forbid", arbitrary_types_allowed=True)) +class GLM_4_7_Flash_Recipe(SlimeRecipe): + """GLM-4.7-Flash (30B-A3B MoE) on 1×8×H100, colocated GRPO. + + TP=1, PP=1, EP=8 fits on a single 8-GPU node. + Uses MTP training, DeepEP, and CPU optimizer offloading. 
+ """ + + gpu_type: str = "H100" + colocate: bool = True + tensor_model_parallel_size: int = 1 + sequence_parallel: bool = False + rollout_num_gpus_per_engine: int = 8 + + # MoE parallelism + expert_model_parallel_size: int = 8 + expert_tensor_parallel_size: int = 1 + pipeline_model_parallel_size: int = 1 + context_parallel_size: int = 1 + moe_token_dispatcher_type: str | None = "flex" + moe_enable_deepep: bool = True + + # MTP + enable_mtp_training: bool = True + mtp_num_layers: int = 1 + mtp_loss_scaling_factor: float = 0.2 + + # Rollout + num_rollout: int = 1 + rollout_batch_size: int = 64 + rollout_max_response_len: int = 4096 + rollout_temperature: float = 1.0 + sglang_mem_fraction_static: float = 0.70 + + save_interval: int = 10 + + # Training + n_samples_per_prompt: int = 8 + global_batch_size: int = 512 + lr: float = 1e-6 + max_tokens_per_gpu: int = 32768 + attention_backend: str | None = "flash" + + # Optimizer offloading (MoE models are memory-heavy) + optimizer_cpu_offload: bool = True + overlap_cpu_optimizer_d2h_h2d: bool = True + use_precision_aware_optimizer: bool = True + + eval_interval: int | None = 10 + eval_max_response_len: int = 4096 diff --git a/modal_training_gym/train_recipes/slime_recipe/recipe.py b/modal_training_gym/train_recipes/slime_recipe/recipe.py index 46ad418f..a68bc8a7 100644 --- a/modal_training_gym/train_recipes/slime_recipe/recipe.py +++ b/modal_training_gym/train_recipes/slime_recipe/recipe.py @@ -115,6 +115,25 @@ class SlimeRecipe(BaseTrainRecipe): adam_beta2: float = 0.98 optimizer: str = "adam" + # ── MoE parallelism ────────────────────────────────────────────────────── + expert_model_parallel_size: int = 1 + expert_tensor_parallel_size: int = 1 + pipeline_model_parallel_size: int = 1 + context_parallel_size: int = 1 + moe_token_dispatcher_type: str | None = None + moe_enable_deepep: bool = False + + # ── Multi-token prediction (MTP) ───────────────────────────────────── + enable_mtp_training: bool = False + 
mtp_num_layers: int = 0 + mtp_loss_scaling_factor: float = 0.0 + + # ── Optimizer offloading ────────────────────────────────────────────── + optimizer_cpu_offload: bool = False + overlap_cpu_optimizer_d2h_h2d: bool = False + use_precision_aware_optimizer: bool = False + attention_backend: str | None = None + # ── Memory and precision ──────────────────────────────────────────────── attention_dropout: float = 0.0 hidden_dropout: float = 0.0 @@ -252,7 +271,7 @@ def _validate_dataset(ds: "DatasetConfig") -> None: @staticmethod def _model_to_fields(m: "ModelConfig") -> dict[str, Any]: arch = SlimeRecipe._validate_custom_model_architecture(m) - return { + fields = { "hf_checkpoint": m.model_path or m.model_name, "num_layers": arch.num_layers, "hidden_size": arch.hidden_size, @@ -271,6 +290,17 @@ def _model_to_fields(m: "ModelConfig") -> dict[str, Any]: "use_rotary_position_embeddings": arch.use_rotary_position_embeddings, "rotary_base": arch.rotary_base, } + if arch.num_experts: + fields["num_experts"] = arch.num_experts + if arch.moe_router_topk: + fields["moe_router_topk"] = arch.moe_router_topk + if arch.moe_ffn_hidden_size: + fields["moe_ffn_hidden_size"] = arch.moe_ffn_hidden_size + if arch.num_shared_experts: + fields["num_shared_experts"] = arch.num_shared_experts + if arch.first_k_dense_replace: + fields["first_k_dense_replace"] = arch.first_k_dense_replace + return fields @staticmethod def _wandb_to_fields(w: "WandbConfig") -> dict[str, Any]: @@ -388,4 +418,16 @@ def get_base_recipe(cls, model_config: ModelConfig) -> "SlimeRecipe | None": return Qwen3_14b_Recipe() if model_config.model_name == "Qwen/Qwen3-32B": return Qwen3_32b_Recipe() + if model_config.model_name == "zai-org/GLM-4.7": + from modal_training_gym.train_recipes.slime_recipe.glm_4_7 import ( + GLM_4_7_Recipe, + ) + + return GLM_4_7_Recipe() + if model_config.model_name == "zai-org/GLM-4.7-Flash": + from modal_training_gym.train_recipes.slime_recipe.glm_4_7_flash import ( + GLM_4_7_Flash_Recipe, + 
) + + return GLM_4_7_Flash_Recipe() return None diff --git a/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.ipynb b/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.ipynb new file mode 100644 index 00000000..e9c79f21 --- /dev/null +++ b/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.ipynb @@ -0,0 +1,434 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "id": "cell-000", + "metadata": {}, + "source": [ + "# Training GLM-4.7 on GSM8K\n", + "\n", + "This tutorial trains [GLM-4.7](https://huggingface.co/zai-org/GLM-4.7),\n", + "a 355B-parameter Mixture-of-Experts model (32B active per token),\n", + "on grade-school math problems from [GSM8K](https://huggingface.co/datasets/openai/gsm8k).\n", + "\n", + "GLM-4.7 is a large MoE model with 160 routed experts and top-8 routing.\n", + "The first 3 layers are dense; the remaining 89 use sparse expert routing.\n", + "Training a model this size requires multi-node parallelism. This tutorial covers:\n", + "\n", + "1. Serving the base model with SGLang on 8\u00d7H100.\n", + "2. Loading the GSM8K dataset and defining a math-answer scorer.\n", + "3. Evaluating the base model.\n", + "4. GRPO-training with [slime](https://github.com/THUDM/slime) using the built-in math reward.\n", + "5. Evaluating the trained checkpoint and comparing.\n", + "\n", + "**Cluster shape:** 8 nodes \u00d7 8 H100 GPUs (64 GPUs total). Training uses\n", + "TP=8, PP=4, CP=2, EP=16 to shard the model across all GPUs." + ] + }, + { + "cell_type": "markdown", + "id": "cell-001", + "metadata": {}, + "source": [ + "## Prerequisites\n", + "\n", + "This tutorial requires a Modal Secret named `huggingface-secret` containing your\n", + "`HF_TOKEN`. Create one at [modal.com/secrets](https://modal.com/secrets) if you\n", + "haven't already \u2014 the cell below fails fast with instructions otherwise." + ] + }, + { + "cell_type": "markdown", + "id": "cell-002", + "metadata": {}, + "source": [ + "> **Note:** you do **not** need to attach a GPU to this notebook. 
All training and\n", + "> serving happens on Modal-managed GPU workers spun up by the SDK \u2014 the notebook\n", + "> itself only needs to issue API calls." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-003", + "metadata": {}, + "outputs": [], + "source": [ + "import modal\n", + "\n", + "try:\n", + " modal.Secret.from_name(\"huggingface-secret\").hydrate()\n", + "except modal.exception.NotFoundError as e:\n", + " raise RuntimeError(\n", + " \"Missing Modal Secret 'huggingface-secret'. Create one at \"\n", + " \"https://modal.com/secrets with an HF_TOKEN entry, then re-run.\"\n", + " ) from e" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-004", + "metadata": {}, + "outputs": [], + "source": [ + "%uv pip install -q git+https://github.com/modal-projects/training-gym.git@main" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-005", + "metadata": {}, + "outputs": [], + "source": [ + "import re\n", + "\n", + "from modal_training_gym import (\n", + " DeploymentConfig,\n", + " EvalConfig,\n", + " EvalRowResult,\n", + " GLM_4_7,\n", + " HuggingFaceDataset,\n", + " SlimeRecipe,\n", + " TrainConfig,\n", + " list_checkpoints,\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cell-006", + "metadata": {}, + "source": [ + "## Serve the base model\n", + "\n", + "First, let's deploy GLM-4.7 so we can test it.\n", + "The 355B model needs 8\u00d7H100 GPUs for serving with tensor parallelism." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-007", + "metadata": {}, + "outputs": [], + "source": [ + "base_model = GLM_4_7()\n", + "base_model_deployment = DeploymentConfig(\n", + " model=base_model,\n", + ").serve()\n", + "print(f\"Base model deployed to {base_model_deployment.url}\")" + ] + }, + { + "cell_type": "markdown", + "id": "cell-008", + "metadata": {}, + "source": [ + "Let's try asking it a math question to see how it responds." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-009", + "metadata": {}, + "outputs": [], + "source": [ + "response = base_model_deployment.generate(\n", + " \"What is 24 * 37?\",\n", + " chat_template_kwargs={\"enable_thinking\": False},\n", + ")\n", + "print(response)" + ] + }, + { + "cell_type": "markdown", + "id": "cell-010", + "metadata": {}, + "source": [ + "## Define the GSM8K dataset\n", + "\n", + "GSM8K contains grade-school math word problems. Each row has a `question`\n", + "and an `answer` that ends with `#### N` \u2014 where `N` is the final numerical answer.\n", + "\n", + "We'll use the answer extraction pattern to score the model: extract the\n", + "number after `####` from the label and compare it to the model's final number." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-011", + "metadata": {}, + "outputs": [], + "source": [ + "class GSM8KDataset(HuggingFaceDataset):\n", + " hf_repo = \"openai/gsm8k\"\n", + " hf_config = \"main\"\n", + " input_column = \"question\"\n", + " output_column = \"answer\"\n", + " output_format = \"jsonl\"\n", + " apply_chat_template = True\n", + " system_prompt = (\n", + " \"You are a math problem solver. Solve the given math problem \"\n", + " \"step by step, then give your final answer as a number on the \"\n", + " \"last line prefixed with ####. For example: #### 42\"\n", + " )\n", + " always_prepare = True\n", + "\n", + "train_dataset = GSM8KDataset(n_rows=50)\n", + "eval_dataset = GSM8KDataset(n_rows=20, hf_split=\"test\")" + ] + }, + { + "cell_type": "markdown", + "id": "cell-012", + "metadata": {}, + "source": [ + "Let's look at a few examples from the dataset." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-013", + "metadata": {}, + "outputs": [], + "source": [ + "df = eval_dataset.to_pandas()\n", + "print(f\"Eval set: {len(df)} rows\")\n", + "df.head(3)" + ] + }, + { + "cell_type": "markdown", + "id": "cell-014", + "metadata": {}, + "source": [ + "## Define the scoring function\n", + "\n", + "GSM8K answers end with `#### N`. We extract the number `N` from both\n", + "the label and the model's response, then check if they match." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-015", + "metadata": {}, + "outputs": [], + "source": [ + "def extract_answer(text: str) -> str | None: # number after ####, else the last number in text; None if there are no digits\n", + " match = re.search(r\"####\\s*(-?\\d[\\d,]*(?:\\.\\d+)?)\", text)\n", + " if match:\n", + " return match.group(1).replace(\",\", \"\").strip()\n", + " numbers = re.findall(r\"-?\\d[\\d,]*(?:\\.\\d+)?\", text)\n", + " if numbers:\n", + " return numbers[-1].replace(\",\", \"\").strip()\n", + " return None\n", + "\n", + "def gsm8k_score(example: dict, response: str) -> EvalRowResult:\n", + " expected = extract_answer(example.get(\"answer\", \"\"))\n", + " predicted = extract_answer(response)\n", + " correct = expected is not None and predicted is not None and expected == predicted\n", + " return EvalRowResult(\n", + " score=1.0 if correct else 0.0,\n", + " response=response,\n", + " )" + ] + }, + { + "cell_type": "markdown", + "id": "cell-016", + "metadata": {}, + "source": [ + "## Evaluate the base model" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-017", + "metadata": {}, + "outputs": [], + "source": [ + "eval_config = EvalConfig(\n", + " dataset=eval_dataset,\n", + " eval_response_fn=gsm8k_score,\n", + " generate_kwargs={\"chat_template_kwargs\": {\"enable_thinking\": False}},\n", + ")\n", + "print(\"--- Running base model evaluation... 
---\")\n", + "base_eval = eval_config.evaluate(base_model_deployment, debug=True)\n", + "print(f\"Base model GSM8K accuracy: {base_eval.mean:.1%}\")\n", + "print(\"--- Base model evaluation complete ---\")" + ] + }, + { + "cell_type": "markdown", + "id": "cell-018", + "metadata": {}, + "source": [ + "## Train with slime\n", + "\n", + "Now let's GRPO-train GLM-4.7 on GSM8K using the built-in `math`\n", + "reward type. This reward extracts the final number from the model's\n", + "response and checks it against the label \u2014 the same logic as our eval\n", + "scoring function.\n", + "\n", + "Because GLM-4.7 is a 355B MoE model, the recipe uses heavy parallelism:\n", + "- **TP=8:** tensor parallelism across all GPUs in a node.\n", + "- **PP=4:** pipeline parallelism across 4 stages.\n", + "- **EP=16:** expert parallelism shards the 160 experts across 16 groups.\n", + "- **CP=2:** context parallelism for long sequences.\n", + "- **CPU optimizer offload:** frees GPU memory for the large parameter count." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-019", + "metadata": {}, + "outputs": [], + "source": [ + "training_run = TrainConfig(\n", + " model=base_model,\n", + " dataset=train_dataset,\n", + " recipe=SlimeRecipe(\n", + " rm_type=\"math\",\n", + "\n", + " gpu_type=\"H100\",\n", + " colocate=True,\n", + " actor_num_nodes=8,\n", + " actor_num_gpus_per_node=8,\n", + " tensor_model_parallel_size=8,\n", + " sequence_parallel=True,\n", + " rollout_num_gpus_per_engine=32,\n", + "\n", + " expert_model_parallel_size=16,\n", + " expert_tensor_parallel_size=1,\n", + " pipeline_model_parallel_size=4,\n", + " context_parallel_size=2,\n", + " attention_backend=\"flash\",\n", + "\n", + " num_rollout=10,\n", + " rollout_batch_size=64,\n", + " rollout_max_response_len=4096,\n", + " rollout_temperature=1.0,\n", + " sglang_mem_fraction_static=0.70,\n", + "\n", + " n_samples_per_prompt=8,\n", + " global_batch_size=512,\n", + " max_tokens_per_gpu=16384,\n", + "\n", + " optimizer_cpu_offload=True,\n", + " overlap_cpu_optimizer_d2h_h2d=True,\n", + " use_precision_aware_optimizer=True,\n", + "\n", + " save_interval=5,\n", + " apply_chat_template_kwargs='{\"enable_thinking\": false}',\n", + " ),\n", + ")" + ] + }, + { + "cell_type": "markdown", + "id": "cell-020", + "metadata": {}, + "source": [ + "## Launch training\n", + "\n", + "`TrainConfig.train()` builds the Modal app, launches the 64-GPU cluster\n", + "(8 nodes \u00d7 8 H100), runs GRPO training, and returns a `TrainResult`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-021", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"--- Running training... ---\")\n", + "train_result = training_run.train()\n", + "print(\"--- Training complete ---\")" + ] + }, + { + "cell_type": "markdown", + "id": "cell-022", + "metadata": {}, + "source": [ + "## Serve and evaluate the trained checkpoint\n", + "\n", + "Let's deploy the trained checkpoint and run the same GSM8K eval." 
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-023", + "metadata": {}, + "outputs": [], + "source": [ + "checkpoint = list_checkpoints(train_result.training_run_id)[-1]\n", + "print(f\"Checkpoint: {checkpoint.path}\")\n", + "\n", + "trained_model_deployment = DeploymentConfig(\n", + " model=GLM_4_7(),\n", + " checkpoint=checkpoint,\n", + " app_name=\"glm-4.7-gsm8k-serve\",\n", + " served_model_name=\"glm-4.7-gsm8k\",\n", + ").serve()\n", + "print(f\"Trained model deployed to {trained_model_deployment.url}\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-024", + "metadata": {}, + "outputs": [], + "source": [ + "print(\"--- Running trained model evaluation... ---\")\n", + "trained_eval = eval_config.evaluate(trained_model_deployment, debug=True)\n", + "print(f\"Trained model GSM8K accuracy: {trained_eval.mean:.1%}\")\n", + "print(\"--- Trained model evaluation complete ---\")" + ] + }, + { + "cell_type": "markdown", + "id": "cell-025", + "metadata": {}, + "source": [ + "## Compare results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "id": "cell-026", + "metadata": {}, + "outputs": [], + "source": [ + "print(f\"Base model GSM8K accuracy: {base_eval.mean:.1%}\")\n", + "print(f\"Trained model GSM8K accuracy: {trained_eval.mean:.1%}\")\n", + "improvement = trained_eval.mean - base_eval.mean\n", + "print(f\"Improvement: {improvement:+.1%}\")" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} diff --git a/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.py b/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.py new file mode 100644 index 00000000..55bf855b --- /dev/null +++ b/tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.py @@ -0,0 +1,223 @@ +# Generated by generate_tutorial.py — do not edit directly. 
+# Source: tutorials/tutorial_generator/rl/003_glm_gsm8k.py + +# # Training GLM-4.7 on GSM8K +# +# This tutorial trains [GLM-4.7](https://huggingface.co/zai-org/GLM-4.7), +# a 355B-parameter Mixture-of-Experts model (32B active per token), +# on grade-school math problems from [GSM8K](https://huggingface.co/datasets/openai/gsm8k). +# +# GLM-4.7 is a large MoE model with 160 routed experts and top-8 routing. +# The first 3 layers are dense; the remaining 89 use sparse expert routing. +# Training a model this size requires multi-node parallelism. This tutorial covers: +# +# 1. Serving the base model with SGLang on 8×H100. +# 2. Loading the GSM8K dataset and defining a math-answer scorer. +# 3. Evaluating the base model. +# 4. GRPO-training with [slime](https://github.com/THUDM/slime) using the built-in math reward. +# 5. Evaluating the trained checkpoint and comparing. +# +# **Cluster shape:** 8 nodes × 8 H100 GPUs (64 GPUs total). Training uses +# TP=8, PP=4, CP=2, EP=16 to shard the model across all GPUs. +# To run the tutorial, run the following command: +# ``` +# uv run python tutorials/rl/003_glm_gsm8k/003_glm_gsm8k.py +# ``` +# ## Prerequisites +# +# This tutorial requires a Modal Secret named `huggingface-secret` containing your +# `HF_TOKEN`. Create one at [modal.com/secrets](https://modal.com/secrets) if you +# haven't already — the cell below fails fast with instructions otherwise. + +import modal + +import re + +from modal_training_gym import ( + DeploymentConfig, + EvalConfig, + EvalRowResult, + GLM_4_7, + HuggingFaceDataset, + SlimeRecipe, + TrainConfig, + list_checkpoints, +) + +# ## Define the GSM8K dataset +# +# GSM8K contains grade-school math word problems. Each row has a `question` +# and an `answer` that ends with `####` followed by the final numerical answer. +# +# We'll use the answer extraction pattern to score the model: extract the +# number after `####` from the label and compare it to the model's final number.
+ +class GSM8KDataset(HuggingFaceDataset): + hf_repo = "openai/gsm8k" + hf_config = "main" + input_column = "question" + output_column = "answer" + output_format = "jsonl" + apply_chat_template = True + system_prompt = ( + "You are a math problem solver. Solve the given math problem " + "step by step, then give your final answer as a number on the " + "last line prefixed with ####. For example: #### 42" + ) + always_prepare = True + +# ## Define the scoring function +# +# GSM8K answers end with `#### `. We extract the number from both +# the label and the model's response, then check if they match. + +def extract_answer(text: str) -> str | None: + match = re.search(r"####\s*(-?[\d,]+(?:\.\d+)?)", text) + if match: + return match.group(1).replace(",", "").strip() + numbers = re.findall(r"-?[\d,]+(?:\.\d+)?", text) + if numbers: + return numbers[-1].replace(",", "").strip() + return None + +def gsm8k_score(example: dict, response: str) -> EvalRowResult: + expected = extract_answer(example.get("answer", "")) + predicted = extract_answer(response) + correct = expected is not None and predicted is not None and expected == predicted + return EvalRowResult( + score=1.0 if correct else 0.0, + response=response, + ) + +import modal + +tutorial_cli_app = modal.App() + +def _main_impl() -> None: + try: + modal.Secret.from_name("huggingface-secret").hydrate() + except modal.exception.NotFoundError as e: + raise RuntimeError( + "Missing Modal Secret 'huggingface-secret'. Create one at " + "https://modal.com/secrets with an HF_TOKEN entry, then re-run." + ) from e + + # ## Serve the base model + # + # First, let's deploy GLM-4.7 so we can test it. + # The 355B model needs 8×H100 GPUs for serving with tensor parallelism. 
+ + base_model = GLM_4_7() + base_model_deployment = DeploymentConfig( + model=base_model, + ).serve() + print(f"Base model deployed to {base_model_deployment.url}") + + train_dataset = GSM8KDataset(n_rows=50) + eval_dataset = GSM8KDataset(n_rows=20, hf_split="test") + + eval_config = EvalConfig( + dataset=eval_dataset, + eval_response_fn=gsm8k_score, + generate_kwargs={"chat_template_kwargs": {"enable_thinking": False}}, + ) + print("--- Running base model evaluation... ---") + base_eval = eval_config.evaluate(base_model_deployment, debug=True) + print(f"Base model GSM8K accuracy: {base_eval.mean:.1%}") + print("--- Base model evaluation complete ---") + + # ## Train with slime + # + # Now let's GRPO-train GLM-4.7 on GSM8K using the built-in `math` + # reward type. This reward extracts the final number from the model's + # response and checks it against the label — the same logic as our eval + # scoring function. + # + # Because GLM-4.7 is a 355B MoE model, the recipe uses heavy parallelism: + # - **TP=8:** tensor parallelism across all GPUs in a node. + # - **PP=4:** pipeline parallelism across 4 stages. + # - **EP=16:** expert parallelism shards the 160 experts across 16 groups. + # - **CP=2:** context parallelism for long sequences. + # - **CPU optimizer offload:** frees GPU memory for the large parameter count. 
+ + training_run = TrainConfig( + model=base_model, + dataset=train_dataset, + recipe=SlimeRecipe( + rm_type="math", + + gpu_type="H100", + colocate=True, + actor_num_nodes=8, + actor_num_gpus_per_node=8, + tensor_model_parallel_size=8, + sequence_parallel=True, + rollout_num_gpus_per_engine=32, + + expert_model_parallel_size=16, + expert_tensor_parallel_size=1, + pipeline_model_parallel_size=4, + context_parallel_size=2, + attention_backend="flash", + + num_rollout=10, + rollout_batch_size=64, + rollout_max_response_len=4096, + rollout_temperature=1.0, + sglang_mem_fraction_static=0.70, + + n_samples_per_prompt=8, + global_batch_size=512, + max_tokens_per_gpu=16384, + + optimizer_cpu_offload=True, + overlap_cpu_optimizer_d2h_h2d=True, + use_precision_aware_optimizer=True, + + save_interval=5, + apply_chat_template_kwargs='{"enable_thinking": false}', + ), + ) + + # ## Launch training + # + # `TrainConfig.train()` builds the Modal app, launches the 64-GPU cluster + # (8 nodes × 8 H100), runs GRPO training, and returns a `TrainResult`. + + print("--- Running training... ---") + train_result = training_run.train() + print("--- Training complete ---") + + # ## Serve and evaluate the trained checkpoint + # + # Let's deploy the trained checkpoint and run the same GSM8K eval. + + checkpoint = list_checkpoints(train_result.training_run_id)[-1] + print(f"Checkpoint: {checkpoint.path}") + + trained_model_deployment = DeploymentConfig( + model=GLM_4_7(), + checkpoint=checkpoint, + app_name="glm-4.7-gsm8k-serve", + served_model_name="glm-4.7-gsm8k", + ).serve() + print(f"Trained model deployed to {trained_model_deployment.url}") + + print("--- Running trained model evaluation... 
# Opening markdown cell of the tutorial. The docstring is the cell's content,
# so wording changes here change the rendered tutorial text.
@markdown
def _intro():
    """
    # Training GLM-4.7 on GSM8K

    This tutorial trains [GLM-4.7](https://huggingface.co/zai-org/GLM-4.7),
    a 355B-parameter Mixture-of-Experts model (32B active per token),
    on grade-school math problems from [GSM8K](https://huggingface.co/datasets/openai/gsm8k).

    GLM-4.7 is a large MoE model with 160 routed experts and top-8 routing.
    The first 3 layers are dense; the remaining 89 use sparse expert routing.
    Training a model this size requires multi-node parallelism.

    This tutorial covers:

    1. Serving the base model with SGLang on 8×H100.
    2. Loading the GSM8K dataset and defining a math-answer scorer.
    3. Evaluating the base model.
    4. GRPO-training with [slime](https://github.com/THUDM/slime) using the built-in math reward.
    5. Evaluating the trained checkpoint and comparing.

    **Cluster shape:** 8 nodes × 8 H100 GPUs (64 GPUs total). Training uses
    TP=8, PP=4, CP=2, EP=16 to shard the model across all GPUs.
    """
# Tutorial cell: declares the GSM8K dataset wrapper and the two subsets that
# the later evaluation and training cells refer to by name.
@code
def _define_dataset():
    class GSM8KDataset(HuggingFaceDataset):
        # Declarative settings read by HuggingFaceDataset. Field meanings are
        # inferred from their names -- confirm against the base class.
        hf_repo = "openai/gsm8k"
        hf_config = "main"
        input_column = "question"
        output_column = "answer"
        output_format = "jsonl"
        apply_chat_template = True
        # Asks the model to finish with a "#### 42"-style line, which is the
        # marker the scoring cell searches for first.
        system_prompt = (
            "You are a math problem solver. Solve the given math problem "
            "step by step, then give your final answer as a number on the "
            "last line prefixed with ####. For example: #### 42"
        )
        always_prepare = True

    # 50 rows for training; 20 rows from the "test" split for evaluation.
    # No hf_split is passed for training, so the base-class default applies.
    train_dataset = GSM8KDataset(n_rows=50)
    eval_dataset = GSM8KDataset(n_rows=20, hf_split="test")
# Tutorial cell: the two helpers below are the cell's content, so they stay
# self-contained (they rely only on `re` and `EvalRowResult` from the imports
# cell).
@code
def _define_scoring():
    def extract_answer(text: str) -> str | None:
        """Return the final numeric answer found in *text*, or ``None``.

        The number that follows a ``####`` marker wins. If no marker is
        followed by a number, the last number anywhere in *text* is used.
        Thousands separators are removed and a redundant fractional tail is
        dropped, so ``"1,234.00"`` and ``"1234"`` compare equal.
        """
        # Every match must start with a digit: `[\d,]+` also matched a bare
        # "," from ordinary prose, which normalised to "" and was returned.
        match = re.search(r"####\s*(-?\d[\d,]*(?:\.\d+)?)", text)
        if match:
            raw = match.group(1)
        else:
            numbers = re.findall(r"-?\d[\d,]*(?:\.\d+)?", text)
            if not numbers:
                return None
            raw = numbers[-1]

        value = raw.replace(",", "")
        if "." in value:
            # "18.00" and "18" are the same answer; compare them as such.
            value = value.rstrip("0").rstrip(".")
        return value

    def gsm8k_score(example: dict, response: str) -> EvalRowResult:
        """Score one eval row: 1.0 when the response's answer equals the label's.

        A row whose label or response yields no number scores 0.0.
        """
        # `or ""` also covers a row whose "answer" value is None.
        expected = extract_answer(example.get("answer") or "")
        predicted = extract_answer(response)
        correct = expected is not None and predicted is not None and expected == predicted
        return EvalRowResult(
            score=1.0 if correct else 0.0,
            response=response,
        )
# Tutorial cell: declares the GRPO training run (model, dataset, slime recipe).
# Nothing is launched here; `training_run.train()` is called in a later cell.
@code
def _define_training():
    training_run = TrainConfig(
        model=base_model,
        dataset=train_dataset,
        recipe=SlimeRecipe(
            # Reward: the built-in "math" reward type.
            rm_type="math",

            # Cluster: 8 nodes x 8 GPUs = 64 H100s in total.
            gpu_type="H100",
            colocate=True,
            actor_num_nodes=8,
            actor_num_gpus_per_node=8,
            tensor_model_parallel_size=8,
            sequence_parallel=True,
            # 64 GPUs / 32 per engine = 2 rollout engines, assuming every GPU
            # serves rollouts -- TODO confirm.
            rollout_num_gpus_per_engine=32,

            # 8 (TP, above) x 4 (PP) x 2 (CP) = 64, i.e. the whole cluster;
            # presumably this leaves a data-parallel size of 1 -- confirm.
            # 160 routed experts / 16 expert-parallel groups = 10 per group.
            expert_model_parallel_size=16,
            expert_tensor_parallel_size=1,
            pipeline_model_parallel_size=4,
            context_parallel_size=2,
            attention_backend="flash",

            num_rollout=10,
            rollout_batch_size=64,
            rollout_max_response_len=4096,
            rollout_temperature=1.0,
            sglang_mem_fraction_static=0.70,

            # 64 prompts x 8 samples per prompt = 512, equal to
            # global_batch_size.
            n_samples_per_prompt=8,
            global_batch_size=512,
            max_tokens_per_gpu=16384,

            optimizer_cpu_offload=True,
            overlap_cpu_optimizer_d2h_h2d=True,
            use_precision_aware_optimizer=True,

            # With num_rollout=10 this presumably saves at rollouts 5 and 10
            # -- verify against the recipe.
            save_interval=5,
            # JSON string; turns thinking off, matching the eval's
            # generate_kwargs.
            apply_chat_template_kwargs='{"enable_thinking": false}',
        ),
    )
---") + trained_eval = eval_config.evaluate(trained_model_deployment, debug=True) + print(f"Trained model GSM8K accuracy: {trained_eval.mean:.1%}") + print("--- Trained model evaluation complete ---") + + +@markdown +def _compare_intro(): + """ + ## Compare results + """ + + +@code +def _compare(): + print(f"Base model GSM8K accuracy: {base_eval.mean:.1%}") + print(f"Trained model GSM8K accuracy: {trained_eval.mean:.1%}") + improvement = trained_eval.mean - base_eval.mean + print(f"Improvement: {improvement:+.1%}")