67 changes: 51 additions & 16 deletions Justfile
@@ -70,34 +70,69 @@ install_training:
@command -v uv > /dev/null || (echo "uv not found. Please install from https://docs.astral.sh/uv/" && exit 1)
cd training && uv sync

# Generate selfplay training data (traditional MCTS, high quality for bootstrapping)
selfplay GAMES="100" PLAYOUTS="20000": install_cargo
mkdir -p training/artifacts
cargo run --release -p selfplay -- {{GAMES}} {{PLAYOUTS}} > training/artifacts/training_data.jsonl
@echo "Generated training data with auxiliary targets (ownership, score_diff)"

# Generate selfplay training data using neural network guidance (faster, more games)
selfplay_nn GAMES="200" PLAYOUTS="800": install_cargo
mkdir -p training/artifacts
cargo run --release -p selfplay -- {{GAMES}} {{PLAYOUTS}} --nn >> training/artifacts/training_data.jsonl
@echo "Generated NN-guided training data with auxiliary targets"
# Generate one durable policy-v2 selfplay run with the production-equivalent uniform-prior baseline
selfplay GAMES="100" PLAYOUTS="20000" MAX_SAMPLES="100000": install_cargo install_training
cd training && uv run generate_selfplay.py --games {{GAMES}} --playouts {{PLAYOUTS}} --max-samples {{MAX_SAMPLES}}

# Generate one durable policy-v2 experimental selfplay run using neural network root priors
selfplay_nn GAMES="200" PLAYOUTS="800" MAX_SAMPLES="100000" PRIOR_WEIGHT="0.01": install_cargo install_training
cd training && uv run generate_selfplay.py --games {{GAMES}} --playouts {{PLAYOUTS}} --nn --prior-weight {{PRIOR_WEIGHT}} --max-samples {{MAX_SAMPLES}}

# Rebuild the active training_data.jsonl replay view from uniform teacher runs only
replay MAX_SAMPLES="100000": install_training
cd training && uv run replay.py build --max-samples {{MAX_SAMPLES}}

# Rebuild replay with explicitly selected teachers for experiments
replay_experimental TEACHERS="uniform,nn_root" MAX_SAMPLES="100000": install_training
cd training && uv run replay.py build --max-samples {{MAX_SAMPLES}} --include-teachers {{TEACHERS}}

# Train the model on existing data
train_only EPOCHS="20": install_training
cd training && uv run train.py --epochs {{EPOCHS}}

# Run one iteration: selfplay + training (traditional MCTS for bootstrapping)
# Evaluate trained policy priors against the uniform-prior production baseline
evaluate_model PAIRS="200" PLAYOUTS="400" MIN_SCORE="0.53": install_cargo
cargo run --release --bin nn_vs_mcts -- {{PAIRS}} {{PLAYOUTS}} --summary-only --min-score {{MIN_SCORE}}

# Evaluate trained policy priors without enforcing a promotion gate
evaluate_model_report PAIRS="100" PLAYOUTS="400": install_cargo
cargo run --release --bin nn_vs_mcts -- {{PAIRS}} {{PLAYOUTS}}

# Sweep root prior blend weights without enforcing a promotion gate
evaluate_prior_sweep PAIRS="200" PLAYOUTS="400" WEIGHTS="0.00,0.02,0.05,0.08,0.10,0.15,0.25": install_cargo install_training
cd training && uv run prior_sweep.py --pairs {{PAIRS}} --playouts {{PLAYOUTS}} --weights {{WEIGHTS}}

# Sanity-check evaluator by comparing uniform priors against uniform priors
evaluate_uniform PAIRS="20" PLAYOUTS="200": install_cargo
cargo run --release --bin nn_vs_mcts -- {{PAIRS}} {{PLAYOUTS}} --uniform-vs-uniform

# Promote the current ONNX model to the browser-served artifact path after evaluation passes
promote_model PAIRS="200" PLAYOUTS="400" MIN_SCORE="0.53": install_cargo
cargo run --release --bin nn_vs_mcts -- {{PAIRS}} {{PLAYOUTS}} --summary-only --min-score {{MIN_SCORE}}
mkdir -p www/public/models
cp training/artifacts/model.onnx www/public/models/htmf-policy.onnx
@echo "Promoted training/artifacts/model.onnx to www/public/models/htmf-policy.onnx"

# Run one iteration: selfplay + training + evaluation
train GAMES="100" PLAYOUTS="20000" EPOCHS="20": install_cargo install_training
@echo "Running selfplay with {{GAMES}} games, {{PLAYOUTS}} playouts..."
just selfplay {{GAMES}} {{PLAYOUTS}}
@echo "Training for {{EPOCHS}} epochs..."
just train_only {{EPOCHS}}
@echo "Training iteration complete!"
@echo "Evaluating trained model for promotion..."
just promote_model
@echo "Training iteration complete and model passed the promotion gate."

# Run iterative training loop (AlphaZero-style with NN-guided selfplay)
iterate ITERATIONS="20" GAMES="200" PLAYOUTS="1000" EPOCHS="20": install_cargo install_training
cd training && uv run iterate.py --iterations {{ITERATIONS}} --games {{GAMES}} --playouts {{PLAYOUTS}} --epochs {{EPOCHS}}
# Run iterative training loop with promotion gate
iterate ITERATIONS="20" GAMES="200" PLAYOUTS="1000" EPOCHS="20" EVAL_PAIRS="100" EVAL_PLAYOUTS="400": install_cargo install_training
cd training && uv run iterate.py --iterations {{ITERATIONS}} --games {{GAMES}} --playouts {{PLAYOUTS}} --epochs {{EPOCHS}} --eval-pairs {{EVAL_PAIRS}} --eval-playouts {{EVAL_PLAYOUTS}}

# Create blank models for debugging
blank_models: install_training
cd training && uv run create_blank_models.py

# Tiny end-to-end ML smoke test
ml_smoke: install_cargo install_training
cd training && uv run generate_selfplay.py --games 1 --playouts 20 --max-samples 1000
cd training && uv run train.py --epochs 1 --num-filters 8 --num-blocks 1 --batch-size 16
cargo run --release --bin nn_vs_mcts -- 1 20
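
The `PRIOR_WEIGHT` default (0.01) and the `WEIGHTS` sweep list above control how strongly the trained policy influences the root priors. The exact blend lives in the Rust search code; as a minimal sketch, assuming a linear interpolation between the uniform baseline and the NN policy (the function name and blend semantics here are illustrative, not the crate's actual API):

```rust
/// Hypothetical root-prior blend: interpolate between the uniform
/// baseline and the NN policy with weight `w` (the Justfile's PRIOR_WEIGHT).
fn blend_root_priors(nn_priors: &[f64], w: f64) -> Vec<f64> {
    let uniform = 1.0 / nn_priors.len() as f64;
    nn_priors
        .iter()
        .map(|&p| (1.0 - w) * uniform + w * p)
        .collect()
}

fn main() {
    // At w = 0.01 (the selfplay_nn default above), the NN only
    // slightly nudges the uniform prior.
    let blended = blend_root_priors(&[0.7, 0.2, 0.1], 0.01);
    println!("{:?}", blended); // ≈ [0.337, 0.332, 0.331]
}
```

Under this reading, `w = 0.0` reproduces the production uniform-prior baseline, which is why the sweep list starts at 0.00.
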
25 changes: 12 additions & 13 deletions bots/src/bin/debug_modes.rs
@@ -1,11 +1,11 @@
//! Debug benchmark: Compare MCTS modes
//! Debug benchmark: Compare policy-prior sources
//!
//! This tests:
//! 1. Pure MCTS (UCB1) vs PUCT with uniform priors - PUCT should be similar or better
//! 1. Production baseline vs itself
//! 2. PUCT with NN priors vs PUCT with uniform priors - shows if NN is helping
//!
//! The PUCT mode uses random rollouts for evaluation (same as Pure MCTS),
//! but uses the PUCT selection formula which outperforms UCB1.
//! All modes use the same PUCT search and random rollout evaluation. The only
//! thing that changes is the policy prior source.

use std::sync::Arc;

@@ -135,12 +135,11 @@ fn main() {
println!("Playouts per move: {}", PLAYOUTS);
println!("Games per comparison: {}", NUM_GAMES);

// Test 1: Pure MCTS (UCB1) vs PUCT with uniform priors
// PUCT should be similar or better than UCB1
// Test 1: Baseline implementation paths should be equivalent.
run_comparison(
"Test 1: Pure MCTS (UCB1) vs PUCT (uniform priors)",
"Pure",
"PUCT",
"Test 1: Production baseline vs explicit uniform priors",
"Baseline",
"Uniform",
|game, player| MCTSBot::new(game, player),
|game, player| MCTSBot::with_neural_net(game, player, None),
);
@@ -161,12 +160,12 @@
|game, player| MCTSBot::with_neural_net(game, player, None),
);

// Test 3: PUCT with NN priors vs Pure MCTS
// Test 3: PUCT with NN priors vs production baseline
let nn_clone2 = nn.clone();
run_comparison(
"Test 3: PUCT (NN priors) vs Pure MCTS (UCB1)",
"Test 3: PUCT (NN priors) vs production baseline",
"NN",
"Pure",
"Baseline",
move |game, player| MCTSBot::with_neural_net(game, player, Some(nn_clone2.clone())),
|game, player| MCTSBot::new(game, player),
);
@@ -179,7 +178,7 @@

println!("\n\nInterpretation Guide:");
println!("=====================");
println!("- PUCT should be similar or better than Pure MCTS");
println!("- Baseline vs Uniform should look roughly even");
println!("- If NN >> Uniform: Neural network policy priors are helping");
println!("- If Uniform >> NN: Neural network priors are hurting (bad training)");
}
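
All three comparisons above share the same PUCT search; only each child's prior changes (uniform `1/N` vs. the NN policy head). As a minimal sketch of the standard AlphaZero-style PUCT selection the doc comment refers to (struct and names are illustrative, not the actual `MCTSBot` internals):

```rust
/// Illustrative per-child statistics; not the real tree node layout.
struct Child {
    visits: u32,    // N(s, a)
    value_sum: f64, // accumulated rollout value for this child
    prior: f64,     // P(a): 1/N for uniform, or a NN policy probability
}

/// Standard PUCT selection:
/// argmax_a  Q(s, a) + c_puct * P(a) * sqrt(N(s)) / (1 + N(s, a))
fn select_child(children: &[Child], c_puct: f64) -> usize {
    let parent_visits: u32 = children.iter().map(|c| c.visits).sum();
    let sqrt_n = f64::from(parent_visits.max(1)).sqrt();
    let score = |c: &Child| {
        let q = if c.visits == 0 {
            0.0
        } else {
            c.value_sum / f64::from(c.visits)
        };
        q + c_puct * c.prior * sqrt_n / (1.0 + f64::from(c.visits))
    };
    children
        .iter()
        .enumerate()
        .max_by(|(_, a), (_, b)| score(a).total_cmp(&score(b)))
        .map(|(i, _)| i)
        .expect("node must have at least one child")
}
```

With uniform priors the `c_puct * prior` term is identical for every child, so Test 1 should come out roughly even — which is exactly what the interpretation guide expects.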