67 changes: 51 additions & 16 deletions Justfile
@@ -70,34 +70,69 @@ install_training:
@command -v uv > /dev/null || (echo "uv not found. Please install from https://docs.astral.sh/uv/" && exit 1)
cd training && uv sync

# Generate selfplay training data (traditional MCTS, high quality for bootstrapping)
selfplay GAMES="100" PLAYOUTS="20000": install_cargo
mkdir -p training/artifacts
cargo run --release -p selfplay -- {{GAMES}} {{PLAYOUTS}} > training/artifacts/training_data.jsonl
@echo "Generated training data with auxiliary targets (ownership, score_diff)"

# Generate selfplay training data using neural network guidance (faster, more games)
selfplay_nn GAMES="200" PLAYOUTS="800": install_cargo
mkdir -p training/artifacts
cargo run --release -p selfplay -- {{GAMES}} {{PLAYOUTS}} --nn >> training/artifacts/training_data.jsonl
@echo "Generated NN-guided training data with auxiliary targets"
# Generate one durable policy-v2 selfplay run with the production-equivalent uniform-prior baseline
selfplay GAMES="100" PLAYOUTS="20000" MAX_SAMPLES="100000": install_cargo install_training
cd training && uv run generate_selfplay.py --games {{GAMES}} --playouts {{PLAYOUTS}} --max-samples {{MAX_SAMPLES}}

# Generate one durable policy-v2 experimental selfplay run using neural network root priors
selfplay_nn GAMES="200" PLAYOUTS="800" MAX_SAMPLES="100000" PRIOR_WEIGHT="0.01": install_cargo install_training
cd training && uv run generate_selfplay.py --games {{GAMES}} --playouts {{PLAYOUTS}} --nn --prior-weight {{PRIOR_WEIGHT}} --max-samples {{MAX_SAMPLES}}

# Rebuild the active training_data.jsonl replay view from uniform teacher runs only
replay MAX_SAMPLES="100000": install_training
cd training && uv run replay.py build --max-samples {{MAX_SAMPLES}}

# Rebuild replay with explicitly selected teachers for experiments
replay_experimental TEACHERS="uniform,nn_root" MAX_SAMPLES="100000": install_training
cd training && uv run replay.py build --max-samples {{MAX_SAMPLES}} --include-teachers {{TEACHERS}}

# Train the model on existing data
train_only EPOCHS="20": install_training
cd training && uv run train.py --epochs {{EPOCHS}}

# Run one iteration: selfplay + training (traditional MCTS for bootstrapping)
# Evaluate trained policy priors against the uniform-prior production baseline
evaluate_model PAIRS="200" PLAYOUTS="400" MIN_SCORE="0.53": install_cargo
cargo run --release --bin nn_vs_mcts -- {{PAIRS}} {{PLAYOUTS}} --summary-only --min-score {{MIN_SCORE}}

# Evaluate trained policy priors without enforcing a promotion gate
evaluate_model_report PAIRS="100" PLAYOUTS="400": install_cargo
cargo run --release --bin nn_vs_mcts -- {{PAIRS}} {{PLAYOUTS}}

# Sweep root prior blend weights without enforcing a promotion gate
evaluate_prior_sweep PAIRS="200" PLAYOUTS="400" WEIGHTS="0.00,0.02,0.05,0.08,0.10,0.15,0.25": install_cargo install_training
cd training && uv run prior_sweep.py --pairs {{PAIRS}} --playouts {{PLAYOUTS}} --weights {{WEIGHTS}}

# Sanity-check evaluator by comparing uniform priors against uniform priors
evaluate_uniform PAIRS="20" PLAYOUTS="200": install_cargo
cargo run --release --bin nn_vs_mcts -- {{PAIRS}} {{PLAYOUTS}} --uniform-vs-uniform

# Promote the current ONNX model to the browser-served artifact path after evaluation passes
promote_model PAIRS="200" PLAYOUTS="400" MIN_SCORE="0.53": install_cargo
cargo run --release --bin nn_vs_mcts -- {{PAIRS}} {{PLAYOUTS}} --summary-only --min-score {{MIN_SCORE}}
mkdir -p www/public/models
cp training/artifacts/model.onnx www/public/models/htmf-policy.onnx
@echo "Promoted training/artifacts/model.onnx to www/public/models/htmf-policy.onnx"

# Run one iteration: selfplay + training + evaluation
train GAMES="100" PLAYOUTS="20000" EPOCHS="20": install_cargo install_training
@echo "Running selfplay with {{GAMES}} games, {{PLAYOUTS}} playouts..."
just selfplay {{GAMES}} {{PLAYOUTS}}
@echo "Training for {{EPOCHS}} epochs..."
just train_only {{EPOCHS}}
@echo "Training iteration complete!"
@echo "Evaluating trained model for promotion..."
just promote_model
@echo "Training iteration complete and model passed the promotion gate."

# Run iterative training loop (AlphaZero-style with NN-guided selfplay)
iterate ITERATIONS="20" GAMES="200" PLAYOUTS="1000" EPOCHS="20": install_cargo install_training
cd training && uv run iterate.py --iterations {{ITERATIONS}} --games {{GAMES}} --playouts {{PLAYOUTS}} --epochs {{EPOCHS}}
# Run iterative training loop with promotion gate
iterate ITERATIONS="20" GAMES="200" PLAYOUTS="1000" EPOCHS="20" EVAL_PAIRS="100" EVAL_PLAYOUTS="400": install_cargo install_training
cd training && uv run iterate.py --iterations {{ITERATIONS}} --games {{GAMES}} --playouts {{PLAYOUTS}} --epochs {{EPOCHS}} --eval-pairs {{EVAL_PAIRS}} --eval-playouts {{EVAL_PLAYOUTS}}

# Create blank models for debugging
blank_models: install_training
cd training && uv run create_blank_models.py

# Tiny end-to-end ML smoke test
ml_smoke: install_cargo install_training
cd training && uv run generate_selfplay.py --games 1 --playouts 20 --max-samples 1000
cd training && uv run train.py --epochs 1 --num-filters 8 --num-blocks 1 --batch-size 16
cargo run --release --bin nn_vs_mcts -- 1 20
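
The `PRIOR_WEIGHT` default (0.01) and the `WEIGHTS` sweep list above control how strongly the trained policy influences the root priors. The exact blend lives in the Rust search code; as a minimal sketch, assuming a linear interpolation between the uniform baseline and the NN policy (the function name and blend semantics here are illustrative, not the crate's actual API):

```rust
/// Hypothetical root-prior blend: interpolate between the uniform
/// baseline and the NN policy with weight `w` (the Justfile's PRIOR_WEIGHT).
fn blend_root_priors(nn_priors: &[f64], w: f64) -> Vec<f64> {
    let uniform = 1.0 / nn_priors.len() as f64;
    nn_priors
        .iter()
        .map(|&p| (1.0 - w) * uniform + w * p)
        .collect()
}

fn main() {
    // At w = 0.01 (the selfplay_nn default above), the NN only
    // slightly nudges the uniform prior.
    let blended = blend_root_priors(&[0.7, 0.2, 0.1], 0.01);
    println!("{:?}", blended); // ≈ [0.337, 0.332, 0.331]
}
```

Under this reading, `w = 0.0` reproduces the production uniform-prior baseline, which is why the sweep list starts at 0.00.
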
25 changes: 12 additions & 13 deletions bots/src/bin/debug_modes.rs
@@ -1,11 +1,11 @@
//! Debug benchmark: Compare MCTS modes
//! Debug benchmark: Compare policy-prior sources
//!
//! This tests:
//! 1. Pure MCTS (UCB1) vs PUCT with uniform priors - PUCT should be similar or better
//! 1. Production baseline vs itself
//! 2. PUCT with NN priors vs PUCT with uniform priors - shows if NN is helping
//!
//! The PUCT mode uses random rollouts for evaluation (same as Pure MCTS),
//! but uses the PUCT selection formula which outperforms UCB1.
//! All modes use the same PUCT search and random rollout evaluation. The only
//! thing that changes is the policy prior source.

use std::sync::Arc;

@@ -135,12 +135,11 @@ fn main() {
println!("Playouts per move: {}", PLAYOUTS);
println!("Games per comparison: {}", NUM_GAMES);

// Test 1: Pure MCTS (UCB1) vs PUCT with uniform priors
// PUCT should be similar or better than UCB1
// Test 1: Baseline implementation paths should be equivalent.
run_comparison(
"Test 1: Pure MCTS (UCB1) vs PUCT (uniform priors)",
"Pure",
"PUCT",
"Test 1: Production baseline vs explicit uniform priors",
"Baseline",
"Uniform",
|game, player| MCTSBot::new(game, player),
|game, player| MCTSBot::with_neural_net(game, player, None),
);
@@ -161,12 +160,12 @@
|game, player| MCTSBot::with_neural_net(game, player, None),
);

// Test 3: PUCT with NN priors vs Pure MCTS
// Test 3: PUCT with NN priors vs production baseline
let nn_clone2 = nn.clone();
run_comparison(
"Test 3: PUCT (NN priors) vs Pure MCTS (UCB1)",
"Test 3: PUCT (NN priors) vs production baseline",
"NN",
"Pure",
"Baseline",
move |game, player| MCTSBot::with_neural_net(game, player, Some(nn_clone2.clone())),
|game, player| MCTSBot::new(game, player),
);
@@ -179,7 +178,7 @@

println!("\n\nInterpretation Guide:");
println!("=====================");
println!("- PUCT should be similar or better than Pure MCTS");
println!("- Baseline vs Uniform should look roughly even");
println!("- If NN >> Uniform: Neural network policy priors are helping");
println!("- If Uniform >> NN: Neural network priors are hurting (bad training)");
}
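
All three comparisons above share the same PUCT search; only each child's prior changes (uniform `1/N` vs. the NN policy head). As a minimal sketch of the standard AlphaZero-style PUCT selection the doc comment refers to (struct and names are illustrative, not the actual `MCTSBot` internals):

```rust
/// Illustrative per-child statistics; not the real tree node layout.
struct Child {
    visits: u32,    // N(s, a)
    value_sum: f64, // accumulated rollout value for this child
    prior: f64,     // P(a): 1/N for uniform, or a NN policy probability
}

/// Standard PUCT selection:
/// argmax_a  Q(s, a) + c_puct * P(a) * sqrt(N(s)) / (1 + N(s, a))
fn select_child(children: &[Child], c_puct: f64) -> usize {
    let parent_visits: u32 = children.iter().map(|c| c.visits).sum();
    let sqrt_n = f64::from(parent_visits.max(1)).sqrt();
    let score = |c: &Child| {
        let q = if c.visits == 0 {
            0.0
        } else {
            c.value_sum / f64::from(c.visits)
        };
        q + c_puct * c.prior * sqrt_n / (1.0 + f64::from(c.visits))
    };
    children
        .iter()
        .enumerate()
        .max_by(|(_, a), (_, b)| score(a).total_cmp(&score(b)))
        .map(|(i, _)| i)
        .expect("node must have at least one child")
}
```

With uniform priors the `c_puct * prior` term is identical for every child, so Test 1 should come out roughly even — which is exactly what the interpretation guide expects.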