diff --git a/agents/claude_reprompt/human_readable_trace.py b/agents/claude_reprompt/human_readable_trace.py
new file mode 120000
index 00000000..d643db01
--- /dev/null
+++ b/agents/claude_reprompt/human_readable_trace.py
@@ -0,0 +1 @@
+../claude/human_readable_trace.py
\ No newline at end of file
diff --git a/agents/claude_reprompt/solve.sh b/agents/claude_reprompt/solve.sh
new file mode 100755
index 00000000..b0b25a4c
--- /dev/null
+++ b/agents/claude_reprompt/solve.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+unset GEMINI_API_KEY
+unset CODEX_API_KEY
+
+export BASH_MAX_TIMEOUT_MS="36000000"
+
+# Stop re-prompting once fewer than this many minutes remain.
+MIN_REMAINING_MINUTES=30
+
+claude --print --verbose --model "$AGENT_CONFIG" --output-format stream-json \
+    --dangerously-skip-permissions "$PROMPT"
+
+# Re-prompt loop: if the agent finishes early, resume the session
+while true; do
+    TIMER_OUTPUT=$(bash timer.sh 2>/dev/null)
+    if echo "$TIMER_OUTPUT" | grep -q "expired"; then
+        break
+    fi
+
+    # Keep only the first match so any extra field (e.g. seconds) is ignored.
+    REMAINING_HOURS=$(echo "$TIMER_OUTPUT" | grep -oP '^\d+(?=:)' | head -n 1)
+    REMAINING_MINS=$(echo "$TIMER_OUTPUT" | grep -oP '(?<=:)\d+' | head -n 1)
+
+    # If the time cannot be parsed (e.g. timer.sh is missing or failed), stop:
+    # otherwise the loop would re-prompt forever with no way to terminate.
+    if [ -z "$REMAINING_HOURS" ] || [ -z "$REMAINING_MINS" ]; then
+        echo "solve.sh: could not parse remaining time; stopping re-prompt loop" >&2
+        break
+    fi
+
+    # Force base 10: zero-padded fields such as "08" are invalid octal in $(( )).
+    TOTAL_REMAINING_MINS=$(( 10#$REMAINING_HOURS * 60 + 10#$REMAINING_MINS ))
+
+    if [ "$TOTAL_REMAINING_MINS" -lt "$MIN_REMAINING_MINUTES" ]; then
+        break
+    fi
+
+    CONTINUATION_PROMPT="You still have ${REMAINING_HOURS}h ${REMAINING_MINS}m remaining. Please continue improving your result and maximize performance."
+
+    claude --print --verbose --continue --model "$AGENT_CONFIG" --output-format stream-json \
+        --dangerously-skip-permissions "$CONTINUATION_PROMPT"
+done
diff --git a/agents/codex_xhigh/human_readable_trace.py b/agents/codex_xhigh/human_readable_trace.py
new file mode 120000
index 00000000..9cf1a5d9
--- /dev/null
+++ b/agents/codex_xhigh/human_readable_trace.py
@@ -0,0 +1 @@
+../codex/human_readable_trace.py
\ No newline at end of file
diff --git a/agents/codex_xhigh/solve.sh b/agents/codex_xhigh/solve.sh
new file mode 100755
index 00000000..443f1c5a
--- /dev/null
+++ b/agents/codex_xhigh/solve.sh
@@ -0,0 +1,12 @@
+#!/bin/bash
+unset ANTHROPIC_API_KEY
+unset GEMINI_API_KEY
+
+# Set reasoning effort to xhigh: write the key first, then append any existing config after it (prepend to ensure precedence)
+file=/home/ben/.codex/config.toml
+tmp="$(mktemp)"
+printf 'model_reasoning_effort = "xhigh"\n\n' > "$tmp"
+[ -f "$file" ] && cat "$file" >> "$tmp"
+mv "$tmp" "$file"
+
+codex --search exec --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$PROMPT"
diff --git a/agents/codex_xhigh_reprompt/human_readable_trace.py b/agents/codex_xhigh_reprompt/human_readable_trace.py
new file mode 120000
index 00000000..9cf1a5d9
--- /dev/null
+++ b/agents/codex_xhigh_reprompt/human_readable_trace.py
@@ -0,0 +1 @@
+../codex/human_readable_trace.py
\ No newline at end of file
diff --git a/agents/codex_xhigh_reprompt/solve.sh b/agents/codex_xhigh_reprompt/solve.sh
new file mode 100755
index 00000000..3afc9730
--- /dev/null
+++ b/agents/codex_xhigh_reprompt/solve.sh
@@ -0,0 +1,45 @@
+#!/bin/bash
+unset ANTHROPIC_API_KEY
+unset GEMINI_API_KEY
+
+# Set reasoning effort to xhigh (prepend to ensure precedence)
+file=/home/ben/.codex/config.toml
+tmp="$(mktemp)"
+printf 'model_reasoning_effort = "xhigh"\n\n' > "$tmp"
+[ -f "$file" ] && cat "$file" >> "$tmp"
+mv "$tmp" "$file"
+
+# Stop re-prompting once fewer than this many minutes remain.
+MIN_REMAINING_MINUTES=30
+
+codex --search exec --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$PROMPT"
+
+# Re-prompt loop: if the agent finishes early, resume the session
+while true; do
+    TIMER_OUTPUT=$(bash timer.sh 2>/dev/null)
+    if echo "$TIMER_OUTPUT" | grep -q "expired"; then
+        break
+    fi
+
+    # Keep only the first match so any extra field (e.g. seconds) is ignored.
+    REMAINING_HOURS=$(echo "$TIMER_OUTPUT" | grep -oP '^\d+(?=:)' | head -n 1)
+    REMAINING_MINS=$(echo "$TIMER_OUTPUT" | grep -oP '(?<=:)\d+' | head -n 1)
+
+    # If the time cannot be parsed (e.g. timer.sh is missing or failed), stop:
+    # otherwise the loop would re-prompt forever with no way to terminate.
+    if [ -z "$REMAINING_HOURS" ] || [ -z "$REMAINING_MINS" ]; then
+        echo "solve.sh: could not parse remaining time; stopping re-prompt loop" >&2
+        break
+    fi
+
+    # Force base 10: zero-padded fields such as "08" are invalid octal in $(( )).
+    TOTAL_REMAINING_MINS=$(( 10#$REMAINING_HOURS * 60 + 10#$REMAINING_MINS ))
+
+    if [ "$TOTAL_REMAINING_MINS" -lt "$MIN_REMAINING_MINUTES" ]; then
+        break
+    fi
+
+    CONTINUATION_PROMPT="You still have ${REMAINING_HOURS}h ${REMAINING_MINS}m remaining. Please continue improving your result and maximize performance."
+
+    codex --search exec resume --last --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$CONTINUATION_PROMPT"
+done
diff --git a/containers/gpt_5_5.def b/containers/gpt_5_5.def
new file mode 100644
index 00000000..50105446
--- /dev/null
+++ b/containers/gpt_5_5.def
@@ -0,0 +1,78 @@
+Bootstrap: docker
+From: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04
+
+%files
+    containers/requirements-direct.txt /opt/requirements-direct.txt
+
+%post
+    chmod 1777 /tmp
+    # Set environment variables
+    export DEBIAN_FRONTEND=noninteractive
+
+    # Update and install system dependencies
+    apt-get update && apt-get install -y \
+        python3.10 \
+        python3-dev \
+        git \
+        wget \
+        curl \
+        build-essential \
+        && rm -rf /var/lib/apt/lists/*
+
+    # Create python3 symlink
+    ln -sf /usr/bin/python3.10 /usr/bin/python3
+    ln -sf /usr/bin/python3.10 /usr/bin/python
+    
+    # Install Node.js (LTS version 22.x) for npm
+    curl -fsSL https://deb.nodesource.com/setup_22.x | bash -
+    apt-get install -y nodejs
+    
+    # Install uv
+    curl -LsSf https://astral.sh/uv/install.sh | sh
+    export PATH="/root/.local/bin:$PATH"
+    
+    uv pip install --system --no-cache vllm==0.11.0 --torch-backend=auto
+
+    #  Pinned direct dependencies
+    uv pip install --system --no-cache -r /opt/requirements-direct.txt
+
+    #  flash-attn (needs no-build-isolation)
+    uv pip install --system --no-cache flash-attn==2.8.3 --no-build-isolation
+
+    #  update CLI harnesses to the latest stable versions
+    # OpenCode doesn't support DeepSeek V4 yet. 
+    npm install -g \
+        @anthropic-ai/claude-code@2.1.116 \
+        @openai/codex@0.124.0 \
+        @google/gemini-cli@0.39.1 \
+        opencode-ai@1.14.20
+
+    # install inspect evals
+    mkdir -p /opt
+    cd /opt
+    git clone https://github.com/UKGovernmentBEIS/inspect_evals.git
+    cd /opt/inspect_evals
+    git checkout 06001a83e6d7c709c2ede0570dce7f1031a0bad8
+    uv pip install --system --no-cache .
+
+    # install inspect ai with debug 
+    mkdir -p /opt
+    cd /opt
+    git clone https://github.com/rank-and-file/inspect_ai_vllm_stdout.git
+    cd inspect_ai_vllm_stdout
+    uv pip install --system --no-cache .
+    
+%environment
+    export PATH="/root/.local/bin:$PATH"
+    export NO_PROXY="localhost,127.0.0.1"
+    export no_proxy="localhost,127.0.0.1"
+
+%runscript
+    exec python3 "$@"
+
+%labels
+    Version v1.0
+    Description Python ML container with CUDA support for transformers and LLM training (using uv) + AI CLI tools
+
+%help
+    Note: Use the --nv flag to enable NVIDIA GPU support when running the container.
diff --git a/dev_utils/extract_traces.py b/dev_utils/extract_traces.py
index 74e2e7c1..ae4affec 100644
--- a/dev_utils/extract_traces.py
+++ b/dev_utils/extract_traces.py
@@ -153,6 +153,11 @@ def main():
         nargs="+",
         help="Input directory names (relative to RESULTS_BASE) to process"
     )
+    parser.add_argument(
+        "--all",
+        action="store_true",
+        help="Copy all runs, not just the latest per task (default: latest only)"
+    )
     args = parser.parse_args()
 
     output_base = Path(OUTPUT_DIR)
@@ -175,8 +180,12 @@ def main():
 
         print(f"\n[{input_dir_name}]")
 
-        # Iterate over only the latest subdirectories (highest ID per prefix)
-        for subdir in sorted(get_latest_subdirs(input_dir)):
+        # Iterate over subdirectories (latest per task by default, all with --all)
+        if args.all:
+            subdirs = sorted(d for d in input_dir.iterdir() if d.is_dir())
+        else:
+            subdirs = sorted(get_latest_subdirs(input_dir))
+        for subdir in subdirs:
             # Determine source file (prefer solve_parsed.txt)
             src_file = subdir / "solve_parsed.txt"
             if not src_file.exists():
@@ -201,6 +210,7 @@ def main():
             copy_other_files(subdir, dest_dir, 'contamination_judgement.txt', api_keys=api_keys)
             copy_other_files(subdir, dest_dir, 'disallowed_model_judgement.txt', api_keys=api_keys)
             copy_other_files(subdir, dest_dir, 'error.log', 'judgement.log', api_keys=api_keys)
+            copy_other_files(subdir, dest_dir, 'time_taken.txt', api_keys=api_keys)
             copy_other_files(subdir, dest_dir, 'system_monitor.log', api_keys=api_keys, optional=True)
 
             tag = " [sanitized]" if was_sanitized else ""
diff --git a/dev_utils/limit_hit_list.py b/dev_utils/limit_hit_list.py
index 12f58093..9bfa144d 100644
--- a/dev_utils/limit_hit_list.py
+++ b/dev_utils/limit_hit_list.py
@@ -10,11 +10,13 @@
     "You've hit your limit",         # Claude Code Pro subscription limit
     "spending_limit",                 # Anthropic/OpenAI spending limit
     "billing_hard_limit",            # OpenAI billing hard limit
-    "insufficient_quota",            # OpenAI quota exceeded
+    "insufficient_quota",            # OpenAI quota exceeded (structured error code)
+    "Quota exceeded. Check your plan",  # OpenAI/Codex quota exceeded (turn.failed message)
     "budget_exceeded",               # General budget error
     "plan does not yet include",     # Z.AI subscription plan restriction
     "token_expired",                 # OpenAI/Codex expired auth token
     "Failed to refresh token",       # Codex CLI refresh token failure
+    "Reconnecting... 5/5",           # Codex CLI exhausted stream-reconnect retries
 ]
 
 
diff --git a/dev_utils/terminated_finder.py b/dev_utils/terminated_finder.py
index f7af3781..90e21feb 100644
--- a/dev_utils/terminated_finder.py
+++ b/dev_utils/terminated_finder.py
@@ -11,19 +11,23 @@ def get_results_dir():
     return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")
 
 
+KILLED_RE = re.compile(rb"run_task\.sh: line \d+: \d+ Killed")
+
+
 def classify_error(error_log_path: Path) -> str | None:
     """Classify the error in error.log. Returns 'terminated', 'killed', or None."""
     if not error_log_path.exists():
         return None
     try:
-        content = error_log_path.read_text()
-        if content.startswith("Terminated"):
-            return "terminated"
-        if re.search(r"\bKilled\b", content):
-            return "killed"
-        return None
+        with open(error_log_path, "rb") as f:
+            head = f.read(4096)
     except Exception:
         return None
+    if head.startswith(b"Terminated"):
+        return "terminated"
+    if KILLED_RE.search(head):
+        return "killed"
+    return None
 
 
 def get_latest_runs(method_path: Path):
diff --git a/scripts/README.md b/scripts/README.md
new file mode 100644
index 00000000..b6197f59
--- /dev/null
+++ b/scripts/README.md
@@ -0,0 +1,149 @@
+# scripts
+
+Post-hoc analysis utilities for PostTrainBench result directories. Most scripts
+here read the contents of `$POST_TRAIN_BENCH_RESULTS_DIR` and produce CSV /
+JSON aggregates; the exception is `rerun_eval_n_times.sh`, which actually
+re-runs the model on a GPU.
+
+## Aggregating results into CSVs
+
+The pipeline is two scripts: `collect.py` reads raw run dirs into per-method
+CSVs, then `aggregate.py` rolls those into per-agent avg/std and the weighted
+leaderboard metric.
+
+### Typical flow
+
+From the repo root, with `POST_TRAIN_BENCH_RESULTS_DIR` pointing at the raw
+results tree:
+
+```bash
+# 1. Collect raw per-run data into per-method CSVs.
+#    Reads metrics.json + contamination/disallowed_model judgements + time_taken.txt,
+#    applies baseline-zeroshot fallback for contaminated/errored cells.
+#    Writes:
+#      final_{method}.csv          — score grid (model x benchmark) with fallback
+#      contamination_{method}.csv  — flags ("", "C", "M", "MC", or error string)
+#      time_overview.csv           — average wall time per method
+python scripts/collect.py
+
+# 2. Aggregate across runs/agents and compute the weighted leaderboard metric.
+#    Reads final_{method}.csv produced above. Writes:
+#      aggregated_avg_{agent}.csv  — per-cell mean across runs (one per multi-run agent)
+#      aggregated_std_{agent}.csv  — per-cell sample stddev (n-1)
+#      single_metrics.csv          — weighted score per individual run
+#      single_metrics_aggregated.csv  — agent-level avg/std/n on the weighted metric
+#      time_aggregated.csv         — agent-level avg/std wall time
+python scripts/aggregate.py
+```
+
+`aggregate.py` skips agents whose run CSVs aren't present in this results
+dir, so it's safe to run against a partial tree.
+
+### `collect.py` flags
+
+```bash
+python scripts/collect.py \
+    --data-dir /path/to/results \      # default: $POST_TRAIN_BENCH_RESULTS_DIR
+    --output-dir /path/to/out \        # default: same as --data-dir
+    --min-run-id 17000000 \            # inclusive lower bound on cluster_id
+    --max-run-id 17200000              # exclusive upper bound on cluster_id
+```
+
+### `aggregate.py` flags
+
+By default `--all` is implied (write everything). Use the flags below to
+restrict to one stage:
+
+```bash
+python scripts/aggregate.py --per-cell      # only aggregated_avg/std_{agent}.csv
+python scripts/aggregate.py --leaderboard   # only single_metrics{,_aggregated}.csv
+python scripts/aggregate.py --time          # only time_aggregated.csv
+```
+
+Same `--data-dir` / `--output-dir` flags as `collect.py`.
+
+### Hardcoded things
+
+| File | What it pins |
+|---|---|
+| `constants.py` (`HARDCODED_AGENT_MAP`) | Which run directories belong to which agent (multi-run agents are how stddev is computed) |
+| `constants.py` (`HARDCODED_BENCHMARKS`, `EXPECTED_MODELS`) | Benchmark + base-model lists |
+| `factors.json` | Per-benchmark weights for the weighted leaderboard metric |
+| `baselines.json` | Hardcoded zero-shot + few-shot baseline scores; used as fallback for contaminated/errored cells (no longer recomputed at every run) |
+
+To add a new agent: add its run-dir names to `HARDCODED_AGENT_MAP` in
+`constants.py`. To add a new benchmark: extend `HARDCODED_BENCHMARKS` and add
+a weight to `factors.json`.
+
+### `verify.py` (refactor regression check)
+
+`verify.py` is a one-off script used when the new pipeline was
+rolled out — it compares two CSV output dirs cell-by-cell with float
+tolerance, to confirm the new pipeline's output matches the old one's
+(except for filename renames). Not part of the normal workflow.
+
+```bash
+python scripts/verify.py \
+    --ground-truth /fast/.../ptb_results_old \
+    --new-output   /fast/.../ptb_results_new
+```
+
+## Other helpers
+
+| Script | Description |
+|---|---|
+| `compute_claude_costs.py` | Claude API spend rollup |
+| `extract_token_usage.py` | Token-usage extraction from agent traces |
+| `migrate_judgement_files.py` | One-off: migrate older judgement file naming |
+| `list_safetensors.py` | List safetensors files under a result tree |
+| `parse_all_to_human_readable.sh` | Run human-readable trace parsers across results |
+| `baselines.json`, `factors.json`, `constants.py`, `utils.py` | Shared config / helpers |
+
+## Re-evaluating a finished run N times
+
+`rerun_eval_n_times.sh` re-evaluates a job's `final_model/` N times and writes
+mean / std / stderr / min / max per metric into `metrics_averaged.json`. Useful
+because each job's standard `metrics.json` is a single decoding sample per
+question and does not capture decoding noise.
+
+It mirrors `src/run_task.sh`'s evaluation step exactly:
+
+- runs `src/eval/tasks/<task>/evaluate.py` (the live source — **not** the
+  potentially-modified snapshot in `<EVAL_DIR>/task/`)
+- inside the same `${POST_TRAIN_BENCH_CONTAINER_NAME}.sif` container
+- with the same fuse-overlayfs HF cache pattern (`with_huggingface_overlay`)
+- using the same `--max-tokens` fallback ladder per task
+
+Per-run JSONs are written to `<EVAL_DIR>/reruns/run_{i}.json` (with
+`run_{i}_{level}.log` alongside). The aggregated file is `<EVAL_DIR>/metrics_averaged.json`.
+
+### Files
+
+| File | Description |
+|---|---|
+| `rerun_eval_n_times.sh` | Driver: re-runs `evaluate.py` N times on one EVAL_DIR and aggregates |
+| `aggregate_metrics_runs.py` | Helper called by the driver: computes mean/std/stderr/min/max from per-run JSONs |
+| `../src/commit_utils/rerun_eval.sub` | HTCondor submission file |
+
+### Usage
+
+#### Locally on a GPU node
+
+From the repo root:
+
+```bash
+scripts/rerun_eval_n_times.sh /path/to/EVAL_DIR 5
+```
+
+`EVAL_DIR` must be an existing job directory containing `final_model/`. The
+task name is parsed from the basename (`<task>_<model_safe>_<cluster_id>`) to
+pick the correct max-tokens fallback ladder.
+
+#### HTCondor 
+
+```bash
+condor_submit_bid 50 \
+  -a "eval_dir=/path/to/EVAL_DIR" \
+  -a "n=5" \
+  src/commit_utils/rerun_eval.sub
+```
diff --git a/scripts/aggregate.py b/scripts/aggregate.py
new file mode 100644
index 00000000..4ecb2457
--- /dev/null
+++ b/scripts/aggregate.py
@@ -0,0 +1,347 @@
+#!/usr/bin/env python3
+"""
+Aggregate results across multiple runs per agent.
+
+Reads final_{method}.csv files produced by collect.py and computes:
+  --per-cell     : aggregated_avg_{agent}.csv, aggregated_std_{agent}.csv
+  --leaderboard  : single_metrics.csv, single_metrics_aggregated.csv
+  --time         : time_aggregated.csv
+  --all          : everything (default)
+
+Usage:
+    python aggregate.py
+    python aggregate.py --data-dir /path/to/results --output-dir /path/to/output
+    python aggregate.py --per-cell --leaderboard
+"""
+import argparse
+import csv
+import os
+import re
+
+from utils import (
+    get_results_dir,
+    load_csv_as_dict,
+    write_csv,
+    load_factors,
+    mean,
+    stddev,
+    is_number,
+    format_time_hms,
+    HARDCODED_AGENT_MAP,
+    HARDCODED_BENCHMARKS,
+    EXPECTED_MODELS,
+)
+
+
+# ---------------------------------------------------------------------------
+# Per-cell avg/std across runs
+# ---------------------------------------------------------------------------
+
+def aggregate_per_cell(
+    agent_name: str,
+    method_names: list[str],
+    data_dir: str,
+    output_dir: str,
+):
+    """
+    Average every (model, benchmark) cell over an agent's runs.
+
+    Loads final_{method}.csv from data_dir for each name in method_names,
+    asserts that every run lists the same models, then writes the per-cell
+    mean() to aggregated_avg_{agent}.csv and the per-cell stddev() to
+    aggregated_std_{agent}.csv in output_dir.
+
+    Returns the (avg_data, std_data) dicts of stringified values.
+    """
+    runs = []
+    reference_models = None
+
+    for method_name in method_names:
+        run, _ = load_csv_as_dict(
+            os.path.join(data_dir, f"final_{method_name}.csv")
+        )
+        models = sorted(run.keys())
+        if reference_models is not None:
+            assert reference_models == models, (
+                f"Model mismatch for {method_name}: "
+                f"expected {reference_models}, got {models}"
+            )
+        else:
+            reference_models = models
+        runs.append(run)
+
+    avg_data = {}
+    std_data = {}
+    # Values are stored as strings, one row dict per model.
+    for model in reference_models:
+        avg_row = avg_data[model] = {}
+        std_row = std_data[model] = {}
+        for bench in HARDCODED_BENCHMARKS:
+            cell = [float(r[model][bench]) for r in runs]
+            avg_row[bench] = str(mean(cell))
+            std_row[bench] = str(stddev(cell))
+
+    # Both outputs share the same model/benchmark layout; avg is written first.
+    for prefix, table in (("avg", avg_data), ("std", std_data)):
+        out_path = os.path.join(output_dir, f"aggregated_{prefix}_{agent_name}.csv")
+        write_csv(out_path, reference_models, HARDCODED_BENCHMARKS, table)
+        print(f"Written: {out_path}")
+
+    return avg_data, std_data
+
+
+# ---------------------------------------------------------------------------
+# Weighted single metric
+# ---------------------------------------------------------------------------
+
+def compute_weighted_metric(
+    data: dict[str, dict[str, str]],
+    factors: dict[str, float],
+) -> float:
+    """
+    Return the factor-weighted sum of per-benchmark means.
+
+    Each benchmark in `factors` is averaged over every model in `data`
+    and scaled by its factor; benchmarks are visited in sorted order.
+    """
+    weighted_sum = 0.0
+    model_count = len(data)
+    for name in sorted(factors):
+        column_total = sum(float(row[name]) for row in data.values())
+        weighted_sum += column_total / model_count * factors[name]
+    return weighted_sum
+
+
+def aggregate_leaderboard(data_dir: str, output_dir: str):
+    """
+    Write single_metrics.csv and single_metrics_aggregated.csv to output_dir.
+
+    Phase 1 writes final_avg_{agent}.csv / final_std_{agent}.csv (per-cell
+    avg/std across an agent's runs) to output_dir for every agent in
+    HARDCODED_AGENT_MAP whose run CSVs all exist in data_dir.
+
+    Phase 2 computes the weighted metric of every final_*.csv found in
+    data_dir or output_dir whose model set equals EXPECTED_MODELS, then
+    groups the per-run metrics by HARDCODED_AGENT_MAP into avg/std/n rows.
+    Agents with any run missing from the scan are skipped.
+    """
+    factors = load_factors()
+
+    # Phase 1: write final_avg/std files so the scan below picks them up.
+    for agent_name, method_names in HARDCODED_AGENT_MAP.items():
+        avg_data, std_data = _load_avg_std_for_agent(
+            agent_name, method_names, data_dir
+        )
+        if avg_data is None:
+            continue
+        for kind, table in (("avg", avg_data), ("std", std_data)):
+            path = os.path.join(output_dir, f"final_{kind}_{agent_name}.csv")
+            write_csv(path, sorted(table.keys()), HARDCODED_BENCHMARKS, table)
+
+    # Phase 2: the per-run final_{method}.csv files live in data_dir, while
+    # the files written above live in output_dir. Scanning only output_dir
+    # would miss every run whenever the two directories differ, so scan
+    # data_dir first and let output_dir entries win on a name clash.
+    scan_dirs = [output_dir]
+    if os.path.isdir(data_dir) and (
+        os.path.realpath(data_dir) != os.path.realpath(output_dir)
+    ):
+        scan_dirs.insert(0, data_dir)
+
+    all_metrics = {}
+    for scan_dir in scan_dirs:
+        for filename in os.listdir(scan_dir):
+            if not (filename.startswith("final_") and filename.endswith(".csv")):
+                continue
+            if filename.startswith("final_time_"):
+                continue
+
+            csv_path = os.path.join(scan_dir, filename)
+            try:
+                data, _ = load_csv_as_dict(csv_path)
+            except Exception:
+                print(f"Warning: could not load {csv_path}.")
+                raise
+
+            # Skip files whose model set differs from EXPECTED_MODELS.
+            if set(data.keys()) != EXPECTED_MODELS:
+                continue
+
+            method_name = filename[len("final_"):-len(".csv")]
+            all_metrics[method_name] = compute_weighted_metric(data, factors)
+
+    # Write individual metrics
+    metrics_path = os.path.join(output_dir, "single_metrics.csv")
+    with open(metrics_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["method", "metric"])
+        for method_name in sorted(all_metrics.keys()):
+            writer.writerow([method_name, all_metrics[method_name]])
+    print(f"Written: {metrics_path}")
+
+    # Compute aggregated metrics per agent group
+    aggregated_path = os.path.join(output_dir, "single_metrics_aggregated.csv")
+    with open(aggregated_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["agent", "avg", "std", "n"])
+        for agent_name in sorted(HARDCODED_AGENT_MAP.keys()):
+            method_names = HARDCODED_AGENT_MAP[agent_name]
+            # Skip agents with missing runs
+            if not all(m in all_metrics for m in method_names):
+                print(f"Skipping agent {agent_name} in leaderboard: missing metrics")
+                continue
+            metrics = [all_metrics[m] for m in method_names]
+            writer.writerow([
+                agent_name,
+                mean(metrics),
+                stddev(metrics),
+                len(metrics),
+            ])
+    print(f"Written: {aggregated_path}")
+
+
+def _load_avg_std_for_agent(
+    agent_name: str,
+    method_names: list[str],
+    data_dir: str,
+) -> tuple[dict | None, dict | None]:
+    """
+    Per-cell avg/std over an agent's final_{method}.csv runs (model list from
+    the first run); returns (None, None) as soon as one run's CSV is missing.
+    """
+    runs = []
+    first_models = None
+    for method_name in method_names:
+        path = os.path.join(data_dir, f"final_{method_name}.csv")
+        if not os.path.exists(path):
+            return None, None
+        table, _ = load_csv_as_dict(path)
+        if first_models is None:
+            first_models = sorted(table.keys())
+        runs.append(table)
+
+    avg_data = {}
+    std_data = {}
+    for model in first_models:
+        avg_row = avg_data[model] = {}
+        std_row = std_data[model] = {}
+        for bench in HARDCODED_BENCHMARKS:
+            cell = [float(run[model][bench]) for run in runs]
+            avg_row[bench] = str(mean(cell))
+            std_row[bench] = str(stddev(cell))
+    return avg_data, std_data
+
+
+# ---------------------------------------------------------------------------
+# Time aggregation
+# ---------------------------------------------------------------------------
+
+def parse_time_to_hours(time_str: str) -> float:
+    """Convert an 'H:MM:SS' string such as '8:17:28' into fractional hours."""
+    fields = time_str.split(":")
+    # Convert in field order; only the first three fields are read.
+    h, m, s = int(fields[0]), int(fields[1]), int(fields[2])
+    # Same summation order as hours + minutes/60 + seconds/3600.
+    return h + m / 60 + s / 3600
+
+
+def aggregate_time(data_dir: str, output_dir: str):
+    """
+    Aggregate per-method average wall time into per-agent avg/std.
+
+    Reads data_dir/time_overview.csv (columns: method, average_time),
+    ignoring rows whose average_time is empty or "N/A", and writes
+    output_dir/time_aggregated.csv with one row per agent in
+    HARDCODED_AGENT_MAP whose methods all have a usable time.
+    """
+    hours_by_method = {}
+    with open(os.path.join(data_dir, "time_overview.csv"), "r", newline="") as f:
+        for row in csv.DictReader(f):
+            method, raw = row["method"], row["average_time"]
+            if not raw or raw == "N/A":
+                continue
+            hours_by_method[method] = parse_time_to_hours(raw)
+
+    output_path = os.path.join(output_dir, "time_aggregated.csv")
+    with open(output_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["agent", "avg_time", "std_time", "n"])
+        for agent_name, method_names in HARDCODED_AGENT_MAP.items():
+            if any(m not in hours_by_method for m in method_names):
+                print(f"Skipping agent {agent_name} in time: missing data")
+                continue
+            hours = [hours_by_method[m] for m in method_names]
+            # Hours -> whole seconds (truncated) for format_time_hms.
+            avg_s = int(mean(hours) * 3600)
+            std_s = int(stddev(hours) * 3600)
+            row_out = [agent_name, format_time_hms(avg_s), format_time_hms(std_s), len(hours)]
+            writer.writerow(row_out)
+    print(f"Written: {output_path}")
+
+
+def _all_finals_exist(method_names: list[str], data_dir: str) -> bool:
+    """Return True when data_dir holds a final_{method}.csv for every method."""
+    for method in method_names:
+        if not os.path.exists(os.path.join(data_dir, f"final_{method}.csv")):
+            return False
+    return True
+
+
+# ---------------------------------------------------------------------------
+# CLI
+# ---------------------------------------------------------------------------
+
+def parse_args():
+    """Parse the command line: two directory options plus four stage switches."""
+    parser = argparse.ArgumentParser(
+        description="Aggregate results across multiple runs per agent."
+    )
+
+    # Directory options; both default to None when not given.
+    directory_options = {
+        "--data-dir": "Directory containing final_*.csv files (from collect.py). "
+        "Defaults to POST_TRAIN_BENCH_RESULTS_DIR or 'results'.",
+        "--output-dir": "Directory to write output CSVs. Defaults to same as --data-dir.",
+    }
+    for flag, help_text in directory_options.items():
+        parser.add_argument(flag, default=None, help=help_text)
+    # Boolean stage switches, registered in the order listed.
+    stage_switches = {
+        "--per-cell": "Write per-cell avg/std CSVs per agent.",
+        "--leaderboard": "Write single_metrics.csv and single_metrics_aggregated.csv.",
+        "--time": "Write time_aggregated.csv.",
+        "--all": "Write everything (default if no flags given).",
+    }
+    for flag, help_text in stage_switches.items():
+        parser.add_argument(flag, action="store_true", help=help_text)
+    return parser.parse_args()
+
+
+def main():
+    """Run the selected aggregation stages (all of them when none is chosen)."""
+    args = parse_args()
+    data_dir = args.data_dir if args.data_dir else get_results_dir()
+    output_dir = args.output_dir if args.output_dir else data_dir
+    os.makedirs(output_dir, exist_ok=True)
+
+    # --all, or no stage flag at all, selects every stage.
+    none_chosen = not any((args.per_cell, args.leaderboard, args.time))
+    run_everything = args.all or none_chosen
+
+    if run_everything or args.per_cell:
+        for agent_name, method_names in HARDCODED_AGENT_MAP.items():
+            if _all_finals_exist(method_names, data_dir):
+                print(f"Processing agent: {agent_name}")
+                aggregate_per_cell(agent_name, method_names, data_dir, output_dir)
+            else:
+                # Agents whose run data isn't available are skipped.
+                print(f"Skipping agent {agent_name}: missing final CSVs")
+
+    if run_everything or args.leaderboard:
+        aggregate_leaderboard(data_dir, output_dir)
+    if run_everything or args.time:
+        aggregate_time(data_dir, output_dir)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/aggregate.sh b/scripts/aggregate.sh
deleted file mode 100644
index a3a41e18..00000000
--- a/scripts/aggregate.sh
+++ /dev/null
@@ -1,54 +0,0 @@
-#!/bin/bash
-source src/commit_utils/set_env_vars.sh
-
-echo "==============================="
-echo "Aggregating method results..."
-python scripts/aggregate_methods.py
-echo "==============================="
-echo "Aggregating time results..."
-python scripts/aggregate_time_baselines.py
-echo "==============================="
-echo "Aggregating contamination results..."
-python scripts/aggregate_contamination.py
-
-python scripts/aggregate_time.py
-sleep 1
-python scripts/aggregate_final.py
-sleep 1
-python scripts/aggregate_summary.py \
-    claude_claude-opus-4-6_10h_run1_old_container \
-    claude_claude-opus-4-6_10h_run2 \
-    claude_claude-opus-4-6_10h_run3 \
-    codex_non_api_gpt-5.3-codex_10h_run1 \
-    codex_non_api_gpt-5.3-codex_10h_run2 \
-    codex_non_api_gpt-5.3-codex_10h_run3 \
-    opencode_opencode_glm-5_10h_run2 \
-    opencode_opencode_kimi-k2.5_10h_run2 \
-    opencode_opencode_minimax-m2.5-free_10h_run2 \
-    opencode_zai_glm-5_10h_run2 \
-    codex_non_api_high_gpt-5.3-codex_10h_run1 \
-    codex_non_api_high_gpt-5.3-codex_10h_run2 \
-    codex_non_api_high_gpt-5.3-codex_10h_run3 \
-    codex_non_api_high_gpt-5.4_10h_run1 \
-    codex_non_api_high_gpt-5.4_10h_run2 \
-    codex_non_api_high_gpt-5.4_10h_run3 \
-    claude_non_api_claude-opus-4-6_1m__10h_run1 \
-    claude_non_api_claude-opus-4-6_1m__10h_run2 \
-    claude_non_api_claude-opus-4-6_1m__10h_run3
-    # opencode_anthropic_claude-opus-4-5_10h \
-    # opencode_opencode_big-pickle_10h \
-    # opencode_opencode_gemini-3-pro_10h \
-    # opencode_opencode_glm-4.7-free_10h \
-    # opencode_opencode_gpt-5.1-codex-max_10h \
-    # opencode_opencode_kimi-k2-thinking_10h \
-    # opencode_opencode_minimax-m2.1-free_10h \
-    # qwen3max_qwen3-max-2026-01-23_10h
-
-# python scripts/aggregate_together.py \
-#     opencode_anthropic_claude-opus-4-5_10h \
-#     opencode_opencode_big-pickle_10h \
-#     opencode_opencode_gemini-3-pro_10h \
-#     opencode_opencode_glm-4.7-free_10h \
-#     opencode_opencode_gpt-5.1-codex-max_10h \
-#     opencode_opencode_kimi-k2-thinking_10h \
-#     opencode_opencode_minimax-m2.1-free_10h
\ No newline at end of file
diff --git a/scripts/aggregate_avg_stddev.py b/scripts/aggregate_avg_stddev.py
deleted file mode 100755
index b962c792..00000000
--- a/scripts/aggregate_avg_stddev.py
+++ /dev/null
@@ -1,131 +0,0 @@
-#!/usr/bin/env python3
-"""
-Aggregate results across multiple runs for each agent.
-
-Takes the outputs of aggregate_final.py (final_*.csv files) and combines
-them into average and standard deviation CSVs for each agent group.
-"""
-import os
-import csv
-import math
-
-from constants import HARDCODED_AGENT_MAP, HARDCODED_BENCHMARKS
-
-def get_results_dir():
-    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")
-
-
-def mean(values: list[float]) -> float:
-    return sum(values) / len(values)
-
-
-def stddev(values: list[float]) -> float:
-    avg = mean(values)
-    variance = sum((x - avg) ** 2 for x in values) / (len(values) - 1)
-    return math.sqrt(variance)
-
-
-def load_csv_as_dict(csv_path: str) -> tuple[dict[str, dict[str, str]], list[str]]:
-    """
-    Load a CSV file into a dict of dicts: {model: {benchmark: value}}.
-    Returns (data_dict, list_of_benchmarks).
-    """
-    data = {}
-    benchmarks = []
-
-    with open(csv_path, "r", newline="") as f:
-        reader = csv.reader(f)
-        header = next(reader)
-
-        # First column is "model", rest are benchmarks
-        benchmarks = header[1:]
-
-        for row in reader:
-            model = row[0]
-            data[model] = {}
-            for i, bench in enumerate(benchmarks):
-                data[model][bench] = row[i + 1]
-
-    return data, benchmarks
-
-
-def aggregate_runs(agent_name: str, method_names: list[str], results_dir: str):
-    """
-    Aggregate results from multiple method runs into average and std CSV files.
-    """
-    # Load all method data
-    all_data = []
-    all_models = None
-
-    for method_name in method_names:
-        csv_path = os.path.join(results_dir, f"final_{method_name}.csv")
-        data, _ = load_csv_as_dict(csv_path)
-
-        models = sorted(data.keys())
-        if all_models is None:
-            all_models = models
-        else:
-            assert all_models == models, (
-                f"Model mismatch for {method_name}: "
-                f"expected {all_models}, got {models}"
-            )
-
-        all_data.append(data)
-
-    # Compute average and std for each (model, benchmark) cell
-    avg_data = {}
-    std_data = {}
-
-    for model in all_models:
-        avg_data[model] = {}
-        std_data[model] = {}
-
-        for bench in HARDCODED_BENCHMARKS:
-            values = []
-            for data in all_data:
-                value_str = data[model][bench]
-                value = float(value_str)
-                values.append(value)
-
-            avg_data[model][bench] = str(mean(values))
-            std_data[model][bench] = str(stddev(values))
-
-    # Write average CSV
-    avg_path = os.path.join(results_dir, f"aggregated_avg_{agent_name}.csv")
-    write_csv(avg_path, all_models, HARDCODED_BENCHMARKS, avg_data)
-    print(f"Written: {avg_path}")
-
-    # Write std CSV
-    std_path = os.path.join(results_dir, f"aggregated_std_{agent_name}.csv")
-    write_csv(std_path, all_models, HARDCODED_BENCHMARKS, std_data)
-    print(f"Written: {std_path}")
-
-
-def write_csv(
-    path: str,
-    models: list[str],
-    benchmarks: list[str],
-    data: dict[str, dict[str, str]],
-):
-    """Write data dict to CSV file."""
-    with open(path, "w", newline="") as f:
-        writer = csv.writer(f)
-        writer.writerow(["model"] + benchmarks)
-
-        for model in models:
-            row = [model]
-            for bench in benchmarks:
-                row.append(data[model][bench])
-            writer.writerow(row)
-
-
-def main():
-    results_dir = get_results_dir()
-
-    for agent_name, method_names in HARDCODED_AGENT_MAP.items():
-        print(f"Processing agent: {agent_name}")
-        aggregate_runs(agent_name, method_names, results_dir)
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/aggregate_avg_stddev_over_benchmarks.py b/scripts/aggregate_avg_stddev_over_benchmarks.py
deleted file mode 100755
index 71c65e39..00000000
--- a/scripts/aggregate_avg_stddev_over_benchmarks.py
+++ /dev/null
@@ -1,199 +0,0 @@
-#!/usr/bin/env python3
-"""
-Aggregate results across multiple runs for each agent, averaging over models.
-
-Takes the outputs of aggregate_final.py (final_*.csv files), averages values
-over models for each benchmark, then computes average and standard deviation
-across runs for each agent.
-
-Produces two CSV files: one for averages, one for standard deviations.
-Rows are benchmarks, columns are agents.
-"""
-import os
-import csv
-import math
-
-from constants import HARDCODED_AGENT_MAP, HARDCODED_BENCHMARKS
-
-
-def get_results_dir():
-    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")
-
-
-def mean(values: list[float]) -> float:
-    return sum(values) / len(values)
-
-
-def stddev(values: list[float]) -> float:
-    avg = mean(values)
-    variance = sum((x - avg) ** 2 for x in values) / (len(values) - 1)
-    return math.sqrt(variance)
-
-
-def load_csv_as_dict(csv_path: str) -> dict[str, dict[str, str]]:
-    """
-    Load a CSV file into a dict of dicts: {model: {benchmark: value}}.
-    """
-    data = {}
-
-    with open(csv_path, "r", newline="") as f:
-        reader = csv.reader(f)
-        header = next(reader)
-
-        # First column is "model", rest are benchmarks
-        benchmarks = header[1:]
-
-        for row in reader:
-            model = row[0]
-            data[model] = {}
-            for i, bench in enumerate(benchmarks):
-                data[model][bench] = row[i + 1]
-
-    return data
-
-
-def compute_model_average(data: dict[str, dict[str, str]], bench: str) -> float:
-    """Compute average value over all models for a given benchmark."""
-    values = []
-    for model in data:
-        value = float(data[model][bench])
-        values.append(value)
-    return mean(values)
-
-
-def aggregate_agent(method_names: list[str], results_dir: str):
-    """
-    Aggregate results from multiple method runs for one agent.
-    Returns (avg_per_benchmark, std_per_benchmark, avg_per_model_benchmark, std_per_model_benchmark, models).
-
-    avg_per_benchmark[bench] = avg value (averaged over models)
-    avg_per_model_benchmark[model][bench] = avg value (per model)
-    """
-    # For each run, compute model-averaged value per benchmark
-    # run_averages[benchmark] = [avg_run1, avg_run2, ...]
-    run_averages = {bench: [] for bench in HARDCODED_BENCHMARKS}
-
-    # For each run, also store per-model values
-    # run_values_per_model[model][benchmark] = [val_run1, val_run2, ...]
-    run_values_per_model = {}
-    all_models = None
-
-    for method_name in method_names:
-        csv_path = os.path.join(results_dir, f"final_{method_name}.csv")
-        data = load_csv_as_dict(csv_path)
-
-        models = sorted(data.keys())
-        if all_models is None:
-            all_models = models
-            for model in models:
-                run_values_per_model[model] = {bench: [] for bench in HARDCODED_BENCHMARKS}
-
-        for bench in HARDCODED_BENCHMARKS:
-            model_avg = compute_model_average(data, bench)
-            run_averages[bench].append(model_avg)
-
-            for model in all_models:
-                value = float(data[model][bench])
-                run_values_per_model[model][bench].append(value)
-
-    # Compute avg and std across runs for each benchmark (averaged over models)
-    avg_per_benchmark = {}
-    std_per_benchmark = {}
-
-    for bench in HARDCODED_BENCHMARKS:
-        values = run_averages[bench]
-        avg_per_benchmark[bench] = mean(values)
-        std_per_benchmark[bench] = stddev(values)
-
-    # Compute avg and std across runs for each (model, benchmark) pair
-    avg_per_model_benchmark = {}
-    std_per_model_benchmark = {}
-
-    for model in all_models:
-        avg_per_model_benchmark[model] = {}
-        std_per_model_benchmark[model] = {}
-        for bench in HARDCODED_BENCHMARKS:
-            values = run_values_per_model[model][bench]
-            avg_per_model_benchmark[model][bench] = mean(values)
-            std_per_model_benchmark[model][bench] = stddev(values)
-
-    return avg_per_benchmark, std_per_benchmark, avg_per_model_benchmark, std_per_model_benchmark, all_models
-
-
-def main():
-    results_dir = get_results_dir()
-
-    # Collect results for all agents
-    # all_avg[benchmark][agent] = avg_value
-    # all_std[benchmark][agent] = std_value
-    all_avg = {bench: {} for bench in HARDCODED_BENCHMARKS}
-    all_std = {bench: {} for bench in HARDCODED_BENCHMARKS}
-
-    # Per-model results: all_avg_per_model[model][benchmark][agent] = avg_value
-    all_avg_per_model = {}
-    all_std_per_model = {}
-
-    agent_names = list(HARDCODED_AGENT_MAP.keys())
-    all_models = None
-
-    for agent_name, method_names in HARDCODED_AGENT_MAP.items():
-        print(f"Processing agent: {agent_name}")
-        avg_per_benchmark, std_per_benchmark, avg_per_model, std_per_model, models = aggregate_agent(method_names, results_dir)
-
-        if all_models is None:
-            all_models = models
-            for model in models:
-                all_avg_per_model[model] = {bench: {} for bench in HARDCODED_BENCHMARKS}
-                all_std_per_model[model] = {bench: {} for bench in HARDCODED_BENCHMARKS}
-
-        for bench in HARDCODED_BENCHMARKS:
-            all_avg[bench][agent_name] = avg_per_benchmark[bench]
-            all_std[bench][agent_name] = std_per_benchmark[bench]
-
-            for model in all_models:
-                all_avg_per_model[model][bench][agent_name] = avg_per_model[model][bench]
-                all_std_per_model[model][bench][agent_name] = std_per_model[model][bench]
-
-    # Write average CSV (over models)
-    avg_path = os.path.join(results_dir, "aggregated_avg_over_models.csv")
-    with open(avg_path, "w", newline="") as f:
-        writer = csv.writer(f)
-        writer.writerow(["benchmark"] + agent_names)
-        for bench in HARDCODED_BENCHMARKS:
-            row = [bench] + [all_avg[bench][agent] for agent in agent_names]
-            writer.writerow(row)
-    print(f"Written: {avg_path}")
-
-    # Write std CSV (over models)
-    std_path = os.path.join(results_dir, "aggregated_std_over_models.csv")
-    with open(std_path, "w", newline="") as f:
-        writer = csv.writer(f)
-        writer.writerow(["benchmark"] + agent_names)
-        for bench in HARDCODED_BENCHMARKS:
-            row = [bench] + [all_std[bench][agent] for agent in agent_names]
-            writer.writerow(row)
-    print(f"Written: {std_path}")
-
-    # Write per-model CSV files
-    for model in all_models:
-        avg_path = os.path.join(results_dir, f"aggregated_avg_{model}.csv")
-        with open(avg_path, "w", newline="") as f:
-            writer = csv.writer(f)
-            writer.writerow(["benchmark"] + agent_names)
-            for bench in HARDCODED_BENCHMARKS:
-                row = [bench] + [all_avg_per_model[model][bench][agent] for agent in agent_names]
-                writer.writerow(row)
-        print(f"Written: {avg_path}")
-
-        std_path = os.path.join(results_dir, f"aggregated_std_{model}.csv")
-        with open(std_path, "w", newline="") as f:
-            writer = csv.writer(f)
-            writer.writerow(["benchmark"] + agent_names)
-            for bench in HARDCODED_BENCHMARKS:
-                row = [bench] + [all_std_per_model[model][bench][agent] for agent in agent_names]
-                writer.writerow(row)
-        print(f"Written: {std_path}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/aggregate_contamination.py b/scripts/aggregate_contamination.py
deleted file mode 100644
index 01a4fabd..00000000
--- a/scripts/aggregate_contamination.py
+++ /dev/null
@@ -1,154 +0,0 @@
-#!/usr/bin/env python3
-import os
-import csv
-
-OUTPUT_PREFIX = "contamination_"        # e.g. "contamination_method.csv"
-
-
-def load_contamination(contamination_path: str):
-    """
-    Return True, False, "IMPORTANT ERR", or "ERR" based on contamination judgement.
-    """
-    if not os.path.exists(contamination_path):
-        return "ERR"
-    try:
-        with open(contamination_path, "r") as f:
-            content = f.read().strip()
-    except Exception:
-        return "ERR"
-    if content == "contamination detected":
-        return True
-    elif content == "no contamination detected":
-        return False
-    else:
-        return "IMPORTANT ERR"
-
-
-def load_disallowed_model(disallowed_path: str):
-    """
-    Return True, False, "IMPORTANT ERR", or "ERR" based on disallowed model judgement.
-    """
-    if not os.path.exists(disallowed_path):
-        return "ERR"
-    try:
-        with open(disallowed_path, "r") as f:
-            content = f.read().strip()
-    except Exception:
-        return "ERR"
-    if content == "disallowed use detected":
-        return True
-    elif content == "only allowed use detected":
-        return False
-    else:
-        return "IMPORTANT ERR"
-
-
-def combine_results(contamination, disallowed_model):
-    """
-    Combine contamination and disallowed model results into a single cell value.
-    
-    Returns:
-        - "" if no illegal use detected (and no contamination)
-        - "M" if disallowed model detected (but no contamination)
-        - "MC" if disallowed model detected & contamination
-        - "C" if only contamination detected (and no disallowed model)
-        - Error string if either result is an error
-    """
-    # Handle error cases first
-    if contamination in ("ERR", "IMPORTANT ERR") or disallowed_model in ("ERR", "IMPORTANT ERR"):
-        errors = []
-        if contamination in ("ERR", "IMPORTANT ERR"):
-            errors.append(f"C:{contamination}")
-        if disallowed_model in ("ERR", "IMPORTANT ERR"):
-            errors.append(f"M:{disallowed_model}")
-        return " ".join(errors)
-    
-    # Both are boolean now
-    if disallowed_model and contamination:
-        return "MC"
-    elif disallowed_model and not contamination:
-        return "M"
-    elif not disallowed_model and contamination:
-        return "C"
-    else:  # not disallowed_model and not contamination
-        return ""
-
-
-def process_method(method_path: str, method_name: str):
-    """
-    For a single method dir (results/method_name), collect the newest run per
-    (benchmark, model), then write a CSV.
-    """
-    # key: (benchmark, model) -> value: {"run_id": int, "path": str}
-    latest_runs = {}
-
-    for entry in os.listdir(method_path):
-        entry_path = os.path.join(method_path, entry)
-        if not os.path.isdir(entry_path):
-            continue
-        try: 
-            benchmark, _, model, run_id = entry.split("_")
-            key = (benchmark, model)
-        except ValueError as e:
-            print(entry)
-            raise ValueError(f"{entry}, {method_path}")
-
-        # keep only highest run_id per (benchmark, model)
-        if key not in latest_runs or run_id > latest_runs[key]["run_id"]:
-            latest_runs[key] = {
-                "run_id": run_id,
-                "path": entry_path,
-            }
-
-    if not latest_runs:
-        # nothing to do for this method
-        return
-
-    # Collect distinct benchmarks and models
-    benchmarks = sorted({b for (b, m) in latest_runs.keys()})
-    models = sorted({m for (b, m) in latest_runs.keys()})
-
-    # Prepare CSV path (next to results/ or inside results/)
-    csv_path = os.path.join(get_results_dir(), f"{OUTPUT_PREFIX}{method_name}.csv")
-
-    with open(csv_path, "w", newline="") as csvfile:
-        writer = csv.writer(csvfile)
-        # header
-        writer.writerow(["model"] + benchmarks)
-
-        # rows
-        for model in models:
-            row = [model]
-            for bench in benchmarks:
-                cell = ""
-                key = (bench, model)
-                if key in latest_runs:
-                    run_dir = latest_runs[key]["path"]
-                    contamination_path = os.path.join(run_dir, "contamination_judgement.txt")
-                    disallowed_path = os.path.join(run_dir, "disallowed_model_judgement.txt")
-                    
-                    contamination = load_contamination(contamination_path)
-                    disallowed_model = load_disallowed_model(disallowed_path)
-                    cell = combine_results(contamination, disallowed_model)
-                row.append(cell)
-            writer.writerow(row)
-
-    print(f"Written: {csv_path}")
-
-
-def get_results_dir():
-    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", 'results')
-
-
-def main():
-    results_dir = get_results_dir()
-    for method_name in os.listdir(results_dir):
-        method_path = os.path.join(results_dir, method_name)
-        if not os.path.isdir(method_path):
-            continue
-        # treat every subdirectory of results/ as a "method"
-        process_method(method_path, method_name)
-
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/scripts/aggregate_final.py b/scripts/aggregate_final.py
deleted file mode 100644
index 86becb2b..00000000
--- a/scripts/aggregate_final.py
+++ /dev/null
@@ -1,168 +0,0 @@
-#!/usr/bin/env python3
-"""
-Final aggregation script that combines method results with baseline fallbacks.
-
-For each method's aggregated CSV, replaces values with baseline when:
-1. The value is not a number (e.g., "ERR", "not avl.", etc.), OR
-2. The corresponding contamination value is not empty (flagged as "C", "M", "MC", etc.)
-
-Baseline values come from aggregated_baseline.csv.
-"""
-import os
-import csv
-import argparse
-
-OUTPUT_PREFIX = "final_"
-
-
-def get_results_dir():
-    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")
-
-
-def is_number(value: str) -> bool:
-    """Check if a string represents a number (int or float)."""
-    if not value:
-        return False
-    try:
-        float(value)
-        return True
-    except ValueError:
-        return False
-
-
-def load_csv_as_dict(csv_path: str) -> tuple[dict, list]:
-    """
-    Load a CSV file into a dict of dicts: {model: {benchmark: value}}.
-    Returns (data_dict, list_of_benchmarks).
-    """
-    data = {}
-    benchmarks = []
-
-    if not os.path.exists(csv_path):
-        return data, benchmarks
-
-    with open(csv_path, "r", newline="") as f:
-        reader = csv.reader(f)
-        header = next(reader, None)
-        if not header:
-            return data, benchmarks
-
-        # First column is "model", rest are benchmarks
-        benchmarks = header[1:]
-
-        for row in reader:
-            if not row:
-                continue
-            model = row[0]
-            data[model] = {}
-            for i, bench in enumerate(benchmarks):
-                if i + 1 < len(row):
-                    data[model][bench] = row[i + 1]
-                else:
-                    data[model][bench] = ""
-
-    return data, benchmarks
-
-
-def process_method(method_name: str, baseline_data: dict, results_dir: str):
-    """
-    Process a single method: load its aggregated and contamination CSVs,
-    apply baseline fallbacks where needed, and write the final CSV.
-    """
-    aggregated_path = os.path.join(results_dir, f"aggregated_{method_name}.csv")
-    contamination_path = os.path.join(results_dir, f"contamination_{method_name}.csv")
-
-    # Load method data
-    method_data, method_benchmarks = load_csv_as_dict(aggregated_path)
-    if not method_data:
-        return
-
-    # Load contamination data (may not exist)
-    contamination_data, _ = load_csv_as_dict(contamination_path)
-
-    # Get all models from method data
-    models = sorted(method_data.keys())
-
-    # Process each cell and apply baseline if needed
-    for model in models:
-        for bench in method_benchmarks:
-            value = method_data[model].get(bench, "")
-            contamination_value = contamination_data.get(model, {}).get(bench, "")
-
-            # Check conditions for baseline replacement
-            needs_baseline = False
-
-            # Condition 1: value is not a number
-            if not is_number(value):
-                needs_baseline = True
-
-            # Condition 2: contamination value is not empty
-            if contamination_value.strip():
-                needs_baseline = True
-
-            if needs_baseline:
-                # Get baseline value (may be empty if model/bench not in baseline)
-                baseline_value = baseline_data.get(model, {}).get(bench, "")
-                method_data[model][bench] = baseline_value
-
-    # Write output
-    output_path = os.path.join(results_dir, f"{OUTPUT_PREFIX}{method_name}.csv")
-
-    with open(output_path, "w", newline="") as f:
-        writer = csv.writer(f)
-        writer.writerow(["model"] + method_benchmarks)
-
-        for model in models:
-            row = [model]
-            for bench in method_benchmarks:
-                row.append(method_data[model].get(bench, ""))
-            writer.writerow(row)
-
-    print(f"Written: {output_path}")
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description="Create final aggregated CSVs with baseline fallbacks."
-    )
-    parser.add_argument(
-        "--methods",
-        nargs="*",
-        default=None,
-        help="Specific methods to process. If not provided, processes all methods.",
-    )
-    return parser.parse_args()
-
-
-def main():
-    args = parse_args()
-    results_dir = get_results_dir()
-
-    # Load baseline data
-    baseline_path = os.path.join(results_dir, "aggregated_baseline_zeroshot.csv")
-    baseline_data, _ = load_csv_as_dict(baseline_path)
-
-    if not baseline_data:
-        print(f"Warning: No baseline data found at {baseline_path}")
-
-    # Determine which methods to process
-    if args.methods:
-        method_names = args.methods
-    else:
-        # Find all aggregated method files (excluding baseline)
-        method_names = []
-        for filename in os.listdir(results_dir):
-            if not filename.startswith("aggregated_") or not filename.endswith(".csv"):
-                continue
-            method_name = filename[len("aggregated_") : -len(".csv")]
-            # Skip baseline itself
-            if method_name != "baseline_zeroshot":
-                method_names.append(method_name)
-
-    # Process each method
-    for method_name in sorted(method_names):
-        process_method(method_name, baseline_data, results_dir)
-
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/scripts/aggregate_methods.py b/scripts/aggregate_methods.py
deleted file mode 100644
index 734782a5..00000000
--- a/scripts/aggregate_methods.py
+++ /dev/null
@@ -1,152 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import os
-import json
-import csv
-
-OUTPUT_PREFIX = "aggregated_"        # e.g. "agg_" if you want names like agg_method.csv
-
-def load_metrics(metrics_path: str, method_name: str = None):
-    """
-    Return a string suitable for the CSV.
-    - Always returns the metrics data if metrics.json exists and is valid
-    - Only shows error messages if metrics.json doesn't exist or is invalid:
-      For non-baseline methods:
-        - "not avl." if time_taken.txt doesn't exist in the folder
-        - "not stored" if time_taken.txt exists but final_model subfolder doesn't
-        - "ERR" for other errors
-      For baseline method:
-        - Just return "ERR" for any errors (old behavior)
-    """
-    # First, try to load metrics.json - if it works, return the data immediately
-    if os.path.exists(metrics_path):
-        try:
-            with open(metrics_path, "r") as f:
-                data = json.load(f)
-            
-            acc = data.get("accuracy")
-            if acc is not None:
-                return str(acc)
-        except Exception:
-            pass  # Fall through to error handling below
-    
-    # Only reach here if metrics.json doesn't exist or is invalid
-    # For baseline, just return "ERR"
-    if method_name == "baseline_zeroshot":
-        return "ERR"
-    
-    # For non-baseline methods, provide more specific error messages
-    run_dir = os.path.dirname(metrics_path)
-    
-    # Check for time_taken.txt
-    time_taken_path = os.path.join(run_dir, "time_taken.txt")
-    if not os.path.exists(time_taken_path):
-        return "not avl."
-    
-    # Check for final_model subdirectory
-    final_model_path = os.path.join(run_dir, "final_model")
-    if not os.path.isdir(final_model_path):
-        return "not stored"
-    
-    # All checks passed but still no valid metrics.json
-    return "ERR"
-
-def process_method(method_path: str, method_name: str, min_run_id=None, max_run_id=None):
-    """
-    For a single method dir (results/method_name), collect the newest run per
-    (benchmark, model), then write a CSV.
-    """
-    # key: (benchmark, model) -> value: {"run_id": int, "path": str}
-    latest_runs = {}
-    
-    for entry in os.listdir(method_path):
-        entry_path = os.path.join(method_path, entry)
-        if not os.path.isdir(entry_path):
-            continue
-        
-        try: 
-            benchmark, _, model, run_id_str = entry.split("_")
-            run_id = int(run_id_str)
-            key = (benchmark, model)
-        except ValueError as e:
-            print(entry)
-            raise ValueError(f"{entry}, {method_path}")
-        
-        if max_run_id is not None and run_id >= max_run_id:
-            continue
-
-        if min_run_id is not None and run_id < min_run_id:
-            continue
-        
-        # keep only highest run_id per (benchmark, model)
-        if key not in latest_runs or run_id > latest_runs[key]["run_id"]:
-            latest_runs[key] = {
-                "run_id": run_id,
-                "path": entry_path,
-            }
-    
-    if not latest_runs:
-        # nothing to do for this method
-        return
-    
-    # Collect distinct benchmarks and models
-    benchmarks = sorted({b for (b, m) in latest_runs.keys()})
-    models = sorted({m for (b, m) in latest_runs.keys()})
-    
-    # Prepare CSV path (next to results/ or inside results/)
-    csv_path = os.path.join(get_results_dir(), f"{OUTPUT_PREFIX}{method_name}.csv")
-    
-    with open(csv_path, "w", newline="") as csvfile:
-        writer = csv.writer(csvfile)
-        # header
-        writer.writerow(["model"] + benchmarks)
-        
-        # rows
-        for model in models:
-            row = [model]
-            for bench in benchmarks:
-                cell = ""
-                key = (bench, model)
-                if key in latest_runs:
-                    run_dir = latest_runs[key]["path"]
-                    metrics_path = os.path.join(run_dir, "metrics.json")
-                    cell = load_metrics(metrics_path, method_name)
-                row.append(cell)
-            writer.writerow(row)
-
-    print(f"Written: {csv_path}")
-
-def get_results_dir():
-    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", 'results')
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Aggregate latest benchmark runs into CSV files.")
-    parser.add_argument(
-        "--min-run-id",
-        type=int,
-        default=None,
-        help="Inclusive lower bound for run ids to consider.",
-    )
-    parser.add_argument(
-        "--max-run-id",
-        type=int,
-        default=None,
-        help="Exclusive upper bound for run ids to consider.",
-    )
-    return parser.parse_args()
-
-
-def main():
-    args = parse_args()
-    results_dir = get_results_dir()
-    
-    for method_name in os.listdir(results_dir):
-        method_path = os.path.join(results_dir, method_name)
-        if not os.path.isdir(method_path):
-            continue
-        
-        # treat every subdirectory of results/ as a "method"
-        process_method(method_path, method_name, min_run_id=args.min_run_id, max_run_id=args.max_run_id)
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/aggregate_metrics_runs.py b/scripts/aggregate_metrics_runs.py
new file mode 100755
index 00000000..26ef7981
--- /dev/null
+++ b/scripts/aggregate_metrics_runs.py
@@ -0,0 +1,103 @@
+#!/usr/bin/env python3
+"""Aggregate per-run metrics JSON files into a single metrics_averaged.json.
+
+Reads every file matching --runs-glob, treats top-level numeric keys as
+per-run metric values, and writes mean/std/stderr/min/max per key plus the
+raw per-run records and source file list.
+
+Booleans and non-finite floats (NaN/Infinity, which Python's json module
+accepts on input) are not counted as metric values, so one bad run cannot
+turn a key's statistics into NaN; the per-key "n" reports how many runs
+actually contributed.
+"""
+from __future__ import annotations
+
+import argparse
+import glob
+import json
+import math
+import sys
+
+
+def _numeric(x: object) -> bool:
+    """Return True if *x* is a finite int or float (bool excluded)."""
+    # bool subclasses int; flags like {"passed": true} are not metrics.
+    if isinstance(x, bool):
+        return False
+    if isinstance(x, int):
+        return True
+    return isinstance(x, float) and math.isfinite(x)
+
+
+def _load_run(path: str) -> dict:
+    """Load one run file; exit with a clear message if it is unusable."""
+    try:
+        with open(path, "r", encoding="utf-8") as f:
+            record = json.load(f)
+    except ValueError as err:  # JSONDecodeError and UnicodeDecodeError
+        sys.exit(f"could not parse {path}: {err}")
+    if not isinstance(record, dict):
+        sys.exit(
+            f"expected a JSON object at top level of {path}, "
+            f"got {type(record).__name__}"
+        )
+    return record
+
+
+def _summarize(vals: list[float | int]) -> dict[str, float | int]:
+    """Return mean/std/stderr/min/max/n for a non-empty list of values."""
+    n = len(vals)
+    mean = sum(vals) / n
+    if n > 1:
+        # Sample (n - 1) standard deviation; with a single value it is
+        # undefined, so 0.0 is reported instead.
+        variance = sum((x - mean) ** 2 for x in vals) / (n - 1)
+        std = math.sqrt(variance)
+    else:
+        std = 0.0
+    return {
+        "mean": mean,
+        "std": std,
+        "stderr": std / math.sqrt(n),
+        "min": min(vals),
+        "max": max(vals),
+        "n": n,
+    }
+
+
+def main() -> None:
+    """Parse arguments, aggregate every matching run file, write the result."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--runs-glob", required=True,
+                        help="Glob matching per-run metrics JSON files.")
+    parser.add_argument("--output", required=True,
+                        help="Path to write the aggregated metrics JSON.")
+    args = parser.parse_args()
+
+    paths = sorted(glob.glob(args.runs_glob))
+    if not paths:
+        sys.exit(f"no run files matched {args.runs_glob}")
+
+    runs = [_load_run(path) for path in paths]
+
+    aggregated: dict[str, dict[str, float | int]] = {}
+    for k in sorted({k for r in runs for k in r}):
+        vals = [r[k] for r in runs if k in r and _numeric(r[k])]
+        if vals:
+            aggregated[k] = _summarize(vals)
+
+    out = {
+        "n_runs": len(runs),
+        "metrics": aggregated,
+        # Raw records are written back verbatim, including non-numeric keys.
+        "per_run": runs,
+        "run_files": paths,
+    }
+
+    with open(args.output, "w", encoding="utf-8") as f:
+        json.dump(out, f, indent=2)
+        f.write("\n")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/aggregate_summary.py b/scripts/aggregate_summary.py
deleted file mode 100644
index 169e30fa..00000000
--- a/scripts/aggregate_summary.py
+++ /dev/null
@@ -1,191 +0,0 @@
-#!/usr/bin/env python3
-"""
-Aggregate final_{method}.csv files into a single summary CSV.
-
-For each benchmark, computes the average score across models per method.
-
-Output format:
-- Rows: benchmarks
-- Columns: baseline_base, baseline_instruct, method1, method2, ...
-- Values: average score across models
-"""
-import os
-import csv
-import argparse
-
-METHOD_NAME_MAP = {
-    "claude_claude-sonnet-4-5": "claude sonnet 4.5",
-    "claude_claude-opus-4-5": "claude opus 4.5",
-    "codex_gpt-5.1-codex-max": "gpt-5.1-codex-max",
-    "codex_gpt-5.2": "gpt-5.2",
-    "gemini_models_gemini-3-pro-preview": "gemini-3-pro",
-    "opencode_anthropic_claude-sonnet-4-5": "opencode claude-sonnet-4-5",
-    "opencode_anthropic_claude-opus-4-5_10h": "opencode claude-opus-4-5",
-    "opencode_opencode_big-pickle_10h": "opencode big-pickle",
-    "opencode_opencode_gemini-3-pro_10h": "opencode gemini-3-pro",
-    "opencode_opencode_glm-4.7-free_10h": "opencode glm-4.7",
-    "opencode_opencode_gpt-5.1-codex-max_10h": "opencode gpt-5.1-codex-max",
-    "opencode_opencode_kimi-k2-thinking_10h": "opencode kimi-k2-thinking",
-    "opencode_opencode_minimax-m2.1-free_10h": "opencode minimax-m2.1",
-}
-
-# Model groups for baseline columns
-BASE_MODELS = ["Qwen3-1.7B-Base", "Qwen3-4B-Base", "SmolLM3-3B-Base", "gemma-3-4b-pt"]
-INSTRUCT_MODELS = ["Qwen3-1.7B", "Qwen3-4B", "SmolLM3-3B", "gemma-3-4b-it"]
-
-
-def get_results_dir():
-    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")
-
-
-def load_csv_as_dict(csv_path: str) -> tuple[dict, list]:
-    """
-    Load a CSV file into a dict of dicts: {model: {benchmark: value}}.
-    Returns (data_dict, list_of_benchmarks).
-    """
-    data = {}
-    benchmarks = []
-
-    if not os.path.exists(csv_path):
-        return data, benchmarks
-
-    with open(csv_path, "r", newline="") as f:
-        reader = csv.reader(f)
-        header = next(reader, None)
-        if not header:
-            return data, benchmarks
-
-        benchmarks = header[1:]
-
-        for row in reader:
-            if not row:
-                continue
-            model = row[0]
-            data[model] = {}
-            for i, bench in enumerate(benchmarks):
-                if i + 1 < len(row):
-                    data[model][bench] = row[i + 1]
-                else:
-                    data[model][bench] = ""
-
-    return data, benchmarks
-
-
-def compute_benchmark_average(data: dict, benchmark: str, models: list = None) -> str:
-    """
-    Compute average score for a benchmark across specified models.
-    If models is None, uses all models in data.
-    Returns empty string if no valid scores found.
-    """
-    if models is None:
-        models = list(data.keys())
-
-    values = []
-    for model in models:
-        val_str = data.get(model, {}).get(benchmark, "")
-        if val_str:
-            try:
-                values.append(float(val_str))
-            except ValueError:
-                pass
-
-    if not values:
-        return ""
-
-    return f"{sum(values) / len(values):.4f}"
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description="Aggregate final CSVs into a single summary with model averages per benchmark."
-    )
-    parser.add_argument(
-        "methods",
-        nargs="+",
-        help="List of methods to include in the aggregation.",
-    )
-    parser.add_argument(
-        "-o", "--output",
-        default=None,
-        help="Output CSV filename. Default: summary.csv in results dir.",
-    )
-    return parser.parse_args()
-
-
-def main():
-    args = parse_args()
-    results_dir = get_results_dir()
-
-    # Load baseline data
-    baseline_path = os.path.join(results_dir, "aggregated_baseline_zeroshot.csv")
-    baseline_data, baseline_benchmarks = load_csv_as_dict(baseline_path)
-
-    if not baseline_data:
-        print(f"Error: No baseline data found at {baseline_path}")
-        return
-
-    # Load all method data
-    method_data = {}
-    method_benchmarks = {}
-
-    for method in args.methods:
-        final_path = os.path.join(results_dir, f"final_{method}.csv")
-        data, benchmarks = load_csv_as_dict(final_path)
-
-        if not data:
-            print(f"Warning: No data found for method '{method}' at {final_path}")
-            continue
-
-        method_data[method] = data
-        method_benchmarks[method] = set(benchmarks)
-
-    if not method_data:
-        print("Error: No valid method data found.")
-        return
-
-    # Find common benchmarks across baseline and all methods
-    common_benchmarks = set(baseline_benchmarks)
-    for method, benchmarks in method_benchmarks.items():
-        common_benchmarks &= benchmarks
-
-    common_benchmarks = sorted(common_benchmarks)
-
-    if not common_benchmarks:
-        print("Error: No common benchmarks found across all files.")
-        return
-
-    print(f"Common benchmarks ({len(common_benchmarks)}): {', '.join(common_benchmarks)}")
-
-    # Prepare output
-    output_path = args.output or os.path.join(results_dir, "summary.csv")
-    methods_ordered = [m for m in args.methods if m in method_data]
-
-    with open(output_path, "w", newline="") as f:
-        writer = csv.writer(f)
-
-        # Header: benchmark, baseline_base, baseline_instruct, method1, method2, ...
-        # Apply METHOD_NAME_MAP to simplify method names in the header
-        display_methods = [METHOD_NAME_MAP.get(m, m) for m in methods_ordered]
-        writer.writerow(["benchmark", "baseline_base", "baseline_instruct"] + display_methods)
-
-        # Benchmark rows
-        for bench in common_benchmarks:
-            row = [bench]
-
-            # Baseline base models average
-            row.append(compute_benchmark_average(baseline_data, bench, BASE_MODELS))
-
-            # Baseline instruct models average
-            row.append(compute_benchmark_average(baseline_data, bench, INSTRUCT_MODELS))
-
-            # Method averages (over all models in each method's file)
-            for method in methods_ordered:
-                row.append(compute_benchmark_average(method_data[method], bench))
-
-            writer.writerow(row)
-
-    print(f"Written: {output_path}")
-
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/scripts/aggregate_time.py b/scripts/aggregate_time.py
deleted file mode 100644
index 308276c0..00000000
--- a/scripts/aggregate_time.py
+++ /dev/null
@@ -1,196 +0,0 @@
-#!/usr/bin/env python3
-import argparse
-import os
-import csv
-import re
-
-OUTPUT_PREFIX = "aggregated_time_"  # e.g. aggregated_time_method.csv
-OVERVIEW_FILENAME = "aggregated_time_overview.csv"
-BUDGET_SECONDS = 10 * 3600  # 10 hours
-
-
-def parse_time_hms(time_str: str) -> int | None:
-    """
-    Parse a time string in H:M:S format and return total seconds.
-    Returns None if parsing fails.
-    """
-    match = re.match(r'^(\d+):(\d{1,2}):(\d{1,2})$', time_str.strip())
-    if not match:
-        return None
-    hours, minutes, seconds = map(int, match.groups())
-    if minutes >= 60 or seconds >= 60:
-        return None
-    return hours * 3600 + minutes * 60 + seconds
-
-
-def format_time_hms(total_seconds: int) -> str:
-    """Convert total seconds to H:M:S format."""
-    hours = total_seconds // 3600
-    minutes = (total_seconds % 3600) // 60
-    seconds = total_seconds % 60
-    return f"{hours}:{minutes:02d}:{seconds:02d}"
-
-
-def load_time_taken(run_dir: str) -> tuple[str, int | None]:
-    """
-    Return the time taken as (display_string, total_seconds).
-    - Returns (H:M:S string, seconds) if valid
-    - Returns ("ERR", None) if time_taken.txt doesn't exist or is invalid
-    """
-    time_taken_path = os.path.join(run_dir, "time_taken.txt")
-    
-    if not os.path.exists(time_taken_path):
-        return "ERR", None
-    
-    try:
-        with open(time_taken_path, "r") as f:
-            time_str = f.read().strip()
-        total_seconds = parse_time_hms(time_str)
-        if total_seconds is None:
-            return "ERR", None
-        return format_time_hms(total_seconds), total_seconds
-    except Exception:
-        return "ERR", None
-
-
-def process_method(method_path: str, method_name: str, min_run_id=None, max_run_id=None) -> dict:
-    """
-    For a single method dir (results/method_name), collect the newest run per
-    (benchmark, model), then write a CSV.
-    
-    Returns a dict with timing statistics for the overview.
-    """
-    # key: (benchmark, model) -> value: {"run_id": int, "path": str}
-    latest_runs = {}
-    
-    for entry in os.listdir(method_path):
-        entry_path = os.path.join(method_path, entry)
-        if not os.path.isdir(entry_path):
-            continue
-        
-        try: 
-            benchmark, _, model, run_id_str = entry.split("_")
-            run_id = int(run_id_str)
-            key = (benchmark, model)
-        except ValueError as e:
-            print(entry)
-            raise ValueError(f"{entry}, {method_path}")
-        
-        if max_run_id is not None and run_id >= max_run_id:
-            continue
-        if min_run_id is not None and run_id < min_run_id:
-            continue
-        
-        # keep only highest run_id per (benchmark, model)
-        if key not in latest_runs or run_id > latest_runs[key]["run_id"]:
-            latest_runs[key] = {
-                "run_id": run_id,
-                "path": entry_path,
-            }
-    
-    if not latest_runs:
-        return {}
-    
-    benchmarks = sorted({b for (b, m) in latest_runs.keys()})
-    models = sorted({m for (b, m) in latest_runs.keys()})
-    
-    csv_path = os.path.join(get_results_dir(), f"{OUTPUT_PREFIX}{method_name}.csv")
-    
-    # Collect timing stats for overview
-    total_seconds = 0
-    valid_count = 0
-    
-    with open(csv_path, "w", newline="") as csvfile:
-        writer = csv.writer(csvfile)
-        writer.writerow(["model"] + benchmarks)
-        
-        for model in models:
-            row = [model]
-            for bench in benchmarks:
-                cell = ""
-                key = (bench, model)
-                if key in latest_runs:
-                    run_dir = latest_runs[key]["path"]
-                    cell, seconds = load_time_taken(run_dir)
-                    if seconds is not None:
-                        total_seconds += seconds
-                        valid_count += 1
-                row.append(cell)
-            writer.writerow(row)
-    print(f"Written: {csv_path}")
-    
-    return {
-        "total_seconds": total_seconds,
-        "valid_count": valid_count,
-    }
-
-
-def write_overview(method_stats: dict[str, dict]):
-    """Write an overview CSV with average times per method."""
-    csv_path = os.path.join(get_results_dir(), OVERVIEW_FILENAME)
-    
-    with open(csv_path, "w", newline="") as csvfile:
-        writer = csv.writer(csvfile)
-        writer.writerow(["method", "average_time", "percentage"])
-        
-        for method_name in sorted(method_stats.keys()):
-            stats = method_stats[method_name]
-            total_secs = stats["total_seconds"]
-            valid = stats["valid_count"]
-            
-            if valid > 0:
-                avg_secs = total_secs // valid
-                avg_str = format_time_hms(avg_secs)
-                pct = (avg_secs / BUDGET_SECONDS) * 100
-                pct_str = f"{pct:.1f}%"
-            else:
-                avg_str = "N/A"
-                pct_str = "N/A"
-            
-            writer.writerow([method_name, avg_str, pct_str])
-    
-    print(f"Written: {csv_path}")
-
-
-def get_results_dir():
-    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", 'results')
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(description="Aggregate latest benchmark run times into CSV files.")
-    parser.add_argument(
-        "--min-run-id",
-        type=int,
-        default=None,
-        help="Inclusive lower bound for run ids to consider.",
-    )
-    parser.add_argument(
-        "--max-run-id",
-        type=int,
-        default=None,
-        help="Exclusive upper bound for run ids to consider.",
-    )
-    return parser.parse_args()
-
-
-def main():
-    args = parse_args()
-    results_dir = get_results_dir()
-    
-    method_stats = {}
-    
-    for method_name in os.listdir(results_dir):
-        method_path = os.path.join(results_dir, method_name)
-        if not os.path.isdir(method_path):
-            continue
-        
-        stats = process_method(method_path, method_name, min_run_id=args.min_run_id, max_run_id=args.max_run_id)
-        if stats:
-            method_stats[method_name] = stats
-    
-    if method_stats:
-        write_overview(method_stats)
-
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/scripts/aggregate_time_avg_stddev.py b/scripts/aggregate_time_avg_stddev.py
deleted file mode 100755
index 59363bb8..00000000
--- a/scripts/aggregate_time_avg_stddev.py
+++ /dev/null
@@ -1,101 +0,0 @@
-#!/usr/bin/env python3
-"""
-Compute averages and standard deviations for time taken across multiple runs.
-
-Reads from aggregated_time_overview.csv and computes statistics for each agent
-group defined in HARDCODED_AGENT_MAP.
-"""
-import os
-import csv
-import math
-
-from constants import HARDCODED_AGENT_MAP
-
-
-def get_results_dir():
-    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")
-
-
-def mean(values: list[float]) -> float:
-    return sum(values) / len(values)
-
-
-def stddev(values: list[float]) -> float:
-    avg = mean(values)
-    variance = sum((x - avg) ** 2 for x in values) / (len(values) - 1)
-    return math.sqrt(variance)
-
-
-def parse_time_to_hours(time_str: str) -> float:
-    """Parse time string like '8:17:28' or '0:55:17' to hours as float."""
-    parts = time_str.split(":")
-    hours = int(parts[0])
-    minutes = int(parts[1])
-    seconds = int(parts[2])
-    return hours + minutes / 60 + seconds / 3600
-
-
-def load_time_csv(csv_path: str) -> dict[str, float]:
-    """
-    Load time CSV into dict: {method: hours}.
-    """
-    data = {}
-
-    with open(csv_path, "r", newline="") as f:
-        reader = csv.DictReader(f)
-        for row in reader:
-            method = row["method"]
-            data[method] = parse_time_to_hours(row["average_time"])
-
-    return data
-
-
-def format_hours_to_time(hours: float) -> str:
-    """Convert hours float back to H:MM:SS format."""
-    total_seconds = int(hours * 3600)
-    h = total_seconds // 3600
-    m = (total_seconds % 3600) // 60
-    s = total_seconds % 60
-    return f"{h}:{m:02d}:{s:02d}"
-
-
-def main():
-    results_dir = get_results_dir()
-    time_csv_path = os.path.join(results_dir, "aggregated_time_overview.csv")
-
-    # Load time data
-    time_data = load_time_csv(time_csv_path)
-
-    # Compute aggregated statistics for each agent group
-    aggregated_results = {}
-
-    for agent_name, method_names in HARDCODED_AGENT_MAP.items():
-        hours_list = []
-
-        for method_name in method_names:
-            hours_list.append(time_data[method_name])
-
-        aggregated_results[agent_name] = {
-            "avg_hours": mean(hours_list),
-            "std_hours": stddev(hours_list),
-            "n": len(hours_list),
-        }
-
-    # Write aggregated time CSV
-    output_path = os.path.join(results_dir, "time_aggregated.csv")
-    with open(output_path, "w", newline="") as f:
-        writer = csv.writer(f)
-        writer.writerow(["agent", "avg_time", "std_time", "n"])
-        for agent_name in HARDCODED_AGENT_MAP.keys():
-            data = aggregated_results[agent_name]
-            writer.writerow([
-                agent_name,
-                format_hours_to_time(data["avg_hours"]),
-                format_hours_to_time(data["std_hours"]),
-                data["n"],
-            ])
-    print(f"Written: {output_path}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/aggregate_time_baselines.py b/scripts/aggregate_time_baselines.py
deleted file mode 100644
index 38eec5ec..00000000
--- a/scripts/aggregate_time_baselines.py
+++ /dev/null
@@ -1,128 +0,0 @@
-import os
-import csv
-from pathlib import Path
-from collections import defaultdict
-
-def parse_directory_name(dirname):
-    """Extract benchmark and model from directory name."""
-    parts = dirname.split('_')
-
-    benchmarks = []
-    benchmarks_path = Path('src/eval/tasks')
-    for item in benchmarks_path.iterdir():
-        benchmarks.append(item.name)
-
-    for benchmark in benchmarks:
-        prefix = benchmark + '_'
-        if dirname.startswith(prefix):
-            rest = dirname[len(prefix):]
-            break
-    else:
-        return None, None, None
-    
-    # Extract model name and run ID
-    parts = rest.rsplit('_', 1)
-    if len(parts) == 2:
-        model = parts[0].replace('_', '/')  # Convert back to org/model format
-        run_id = parts[1]
-        return benchmark, model, run_id
-    
-    return None, None, None
-
-def get_latest_results(results_dir):
-    """Get the latest run for each model-benchmark combination."""
-    results_path = Path(results_dir)
-    
-    # Store all runs grouped by (benchmark, model)
-    runs = defaultdict(list)
-    
-    # Scan all directories
-    for subdir in results_path.glob('baseline_zeroshot/*'):
-        if subdir.is_dir():
-            benchmark, model, run_id = parse_directory_name(subdir.name)
-            
-            if benchmark and model and run_id:
-                time_file = subdir / 'time_taken.txt'
-                runs[(benchmark, model)].append({
-                    'path': time_file,
-                    'run_id': run_id,
-                    'dir_name': subdir.name,
-                    'exists': time_file.exists()
-                })
-    
-    # Get the latest run for each combination (highest run_id)
-    latest_runs = {}
-    for (benchmark, model), run_list in runs.items():
-        latest = max(run_list, key=lambda x: x['run_id'])
-        latest_runs[(benchmark, model)] = {
-            'path': latest['path'],
-            'exists': latest['exists']
-        }
-    
-    return latest_runs
-
-def read_time_taken(time_path):
-    """Read time_taken.txt and return the time string, removing leading zeros."""
-    with open(time_path, 'r') as f:
-        time_str = f.read().strip()
-    
-    # If format is HH:MM:SS and hours are 00, strip to MM:SS
-    if ':' in time_str:
-        parts = time_str.split(':')
-        if len(parts) == 3 and parts[0] == '00':
-            time_str = f"{parts[1]}:{parts[2]}"
-    
-    return time_str
-
-def create_results_csv(results_dir, output_file='benchmark_times.csv'):
-    """Create a CSV file with aggregated time taken results."""
-    # Get latest results
-    latest_runs = get_latest_results(results_dir)
-    
-    # Collect all unique benchmarks and models
-    benchmarks = sorted(set(b for b, m in latest_runs.keys()))
-    models = sorted(set(m for b, m in latest_runs.keys()))
-    
-    # Create results matrix
-    results = {}
-    for model in models:
-        results[model] = {}
-        for benchmark in benchmarks:
-            if (benchmark, model) in latest_runs:
-                run_info = latest_runs[(benchmark, model)]
-                if run_info['exists']:
-                    results[model][benchmark] = read_time_taken(str(run_info['path']))
-                else:
-                    results[model][benchmark] = 'ERR'
-            else:
-                results[model][benchmark] = 'N/A'
-    
-    # Write to CSV
-    with open(output_file, 'w', newline='') as f:
-        writer = csv.writer(f)
-        
-        # Header row
-        writer.writerow(['Model'] + benchmarks)
-        
-        # Data rows - prefix times with single quote to force string interpretation
-        for model in models:
-            row = [model] + [f"'{results[model][b]}" for b in benchmarks]
-            writer.writerow(row)
-    
-    print(f"Results saved to {output_file}")
-    print(f"Models: {len(models)}")
-    print(f"Benchmarks: {len(benchmarks)}")
-    print(f"Total entries: {len(latest_runs)}")
-    
-    # Count ERR entries
-    err_count = sum(1 for model in models for b in benchmarks if results[model][b] == 'ERR')
-    if err_count > 0:
-        print(f"Warning: {err_count} entries missing time_taken.txt (marked as ERR)")
-
-def get_results_dir():
-    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", 'results')
-
-if __name__ == '__main__':
-    results_dir = get_results_dir()
-    
-    create_results_csv(results_dir, os.path.join(results_dir, 'benchmark_times.csv'))
\ No newline at end of file
diff --git a/scripts/aggregate_together.py b/scripts/aggregate_together.py
deleted file mode 100644
index 7b167946..00000000
--- a/scripts/aggregate_together.py
+++ /dev/null
@@ -1,121 +0,0 @@
-#!/usr/bin/env python3
-"""
-Aggregate final_{method}.csv files into a single concatenated CSV.
-
-Output format:
-- Method name row
-- Header row (model, benchmark1, benchmark2, ...)
-- Data rows for that method
-- Blank line
-- Next method...
-"""
-import os
-import csv
-import argparse
-
-METHOD_NAME_MAP = {
-    "claude_claude-sonnet-4-5_final_v3": "claude sonnet 4.5",
-    "claude_claude-opus-4-5_final_v3": "claude opus 4.5",
-    "codex_gpt-5.1-codex-max_final_v3": "gpt-5.1-codex-max",
-    "codex_gpt-5.2_final_v3": "gpt-5.2",
-    "gemini_models_gemini-3-pro-preview_final_v3": "gemini-3-pro",
-}
-
-def get_results_dir():
-    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")
-
-
-def load_csv_rows(csv_path: str) -> tuple[list, list]:
-    """
-    Load a CSV file and return (header, rows).
-    """
-    header = []
-    rows = []
-
-    if not os.path.exists(csv_path):
-        return header, rows
-
-    with open(csv_path, "r", newline="") as f:
-        reader = csv.reader(f)
-        header = next(reader, None)
-        if not header:
-            return [], []
-
-        for row in reader:
-            if row:
-                rows.append(row)
-
-    return header, rows
-
-
-def parse_args():
-    parser = argparse.ArgumentParser(
-        description="Aggregate final CSVs into a single concatenated file."
-    )
-    parser.add_argument(
-        "methods",
-        nargs="+",
-        help="List of methods to include in the aggregation.",
-    )
-    parser.add_argument(
-        "-o", "--output",
-        default=None,
-        help="Output CSV filename. Default: summary_concat.csv in results dir.",
-    )
-    parser.add_argument(
-        "--include-baseline",
-        action="store_true",
-        help="Include baseline data as the first section.",
-    )
-    return parser.parse_args()
-
-
-def main():
-    args = parse_args()
-    results_dir = get_results_dir()
-
-    output_path = args.output or os.path.join(results_dir, "summary_concat.csv")
-
-    with open(output_path, "w", newline="") as f:
-        writer = csv.writer(f)
-
-        # Optionally include baseline first
-        if args.include_baseline:
-            baseline_path = os.path.join(results_dir, "aggregated_baseline_zeroshot.csv")
-            header, rows = load_csv_rows(baseline_path)
-
-            if header and rows:
-                writer.writerow(["baseline"])
-                writer.writerow(header)
-                for row in rows:
-                    writer.writerow(row)
-                writer.writerow([])  # blank line
-            else:
-                print(f"Warning: No baseline data found at {baseline_path}")
-
-        # Process each method
-        for method in args.methods:
-            final_path = os.path.join(results_dir, f"final_{method}.csv")
-            header, rows = load_csv_rows(final_path)
-
-            if not header or not rows:
-                print(f"Warning: No data found for method '{method}' at {final_path}")
-                continue
-
-            display_name = METHOD_NAME_MAP.get(method, method)
-
-            # Method name row
-            writer.writerow([display_name])
-            # Header row
-            writer.writerow(header)
-            # Data rows
-            for row in rows:
-                writer.writerow(row)
-            # Blank line
-            writer.writerow([])
-
-    print(f"Written: {output_path}")
-
-
-if __name__ == "__main__":
-    main()
\ No newline at end of file
diff --git a/scripts/baselines.json b/scripts/baselines.json
new file mode 100644
index 00000000..d80799c3
--- /dev/null
+++ b/scripts/baselines.json
@@ -0,0 +1,114 @@
+{
+    "zeroshot": {
+        "Qwen3-1.7B": {
+            "aime2025": 0.26666666666666666,
+            "arenahardwriting": 0.5,
+            "bfcl": 0.94,
+            "gpqamain": 0.3549107142857143,
+            "gsm8k": 0.8847611827141774,
+            "healthbench": 0.44918867035528026,
+            "humaneval": 0.6890243902439024
+        },
+        "Qwen3-1.7B-Base": {
+            "aime2025": 0.0,
+            "arenahardwriting": 0.009142053445850914,
+            "bfcl": 0.0,
+            "gpqamain": 0.140625,
+            "gsm8k": 0.12661106899166036,
+            "healthbench": 0.07537565969807473,
+            "humaneval": 0.07926829268292683
+        },
+        "Qwen3-4B": {
+            "aime2025": 0.5333333333333333,
+            "arenahardwriting": 0.8683943089430894,
+            "bfcl": 0.95,
+            "gpqamain": 0.44642857142857145,
+            "gsm8k": 0.9378316906747536,
+            "healthbench": 0.5272399437524256,
+            "humaneval": 0.774390243902439
+        },
+        "Qwen3-4B-Base": {
+            "aime2025": 0.03333333333333333,
+            "arenahardwriting": 0.03417533432392273,
+            "bfcl": 0.0,
+            "gpqamain": 0.13392857142857142,
+            "gsm8k": 0.4184988627748294,
+            "healthbench": 0.13383521639663787,
+            "humaneval": 0.36585365853658536
+        },
+        "SmolLM3-3B": {
+            "aime2025": 0.26666666666666666,
+            "arenahardwriting": 0.492,
+            "bfcl": 0.84,
+            "gpqamain": 0.3325892857142857,
+            "gsm8k": 0.8218347232752085,
+            "healthbench": 0.2957717718639611,
+            "humaneval": 0.7012195121951219
+        },
+        "SmolLM3-3B-Base": {
+            "aime2025": 0.03333333333333333,
+            "arenahardwriting": 0.004225352112676056,
+            "bfcl": 0.0,
+            "gpqamain": 0.049107142857142856,
+            "gsm8k": 0.21076573161485973,
+            "healthbench": 0.0,
+            "humaneval": 0.06097560975609756
+        },
+        "gemma-3-4b-it": {
+            "aime2025": 0.1,
+            "arenahardwriting": 0.948,
+            "bfcl": 0.67,
+            "gpqamain": 0.31473214285714285,
+            "gsm8k": 0.8354814253222138,
+            "healthbench": 0.46063396051286026,
+            "humaneval": 0.6951219512195121
+        },
+        "gemma-3-4b-pt": {
+            "aime2025": 0.0,
+            "arenahardwriting": 0.0028530670470756064,
+            "bfcl": 0.06,
+            "gpqamain": 0.015625,
+            "gsm8k": 0.06141015921152388,
+            "healthbench": 0.17039403723633986,
+            "humaneval": 0.006097560975609756
+        }
+    },
+    "fewshot": {
+        "Qwen3-1.7B-Base": {
+            "aime2025": 0.05333333333333333,
+            "arenahardwriting": 0.05314625850340136,
+            "bfcl": 0.0,
+            "gpqamain": 0.25959821428571417,
+            "gsm8k": 0.46679302501895537,
+            "healthbench": 0.2110110691560308,
+            "humaneval": 0.25243902439024396
+        },
+        "Qwen3-4B-Base": {
+            "aime2025": 0.09000000000000001,
+            "arenahardwriting": 0.19168260038240917,
+            "bfcl": 0.0,
+            "gpqamain": 0.29888392857142837,
+            "gsm8k": 0.7438210765731573,
+            "healthbench": 0.2179351466647625,
+            "humaneval": 0.6774390243902438
+        },
+        "SmolLM3-3B-Base": {
+            "aime2025": 0.06000000000000001,
+            "arenahardwriting": 0.03248811410459588,
+            "bfcl": 0.0,
+            "gpqamain": 0.13236607142857182,
+            "gsm8k": 0.5298711144806676,
+            "healthbench": 0.10165123092180756,
+            "humaneval": 0.3237804878048783
+        },
+        "gemma-3-4b-pt": {
+            "aime2025": 0.0,
+            "arenahardwriting": 0.01257396449704142,
+            "bfcl": 0.06699999999999998,
+            "gpqamain": 0.21406249999999985,
+            "gsm8k": 0.0583775587566339,
+            "healthbench": 0.23317845064882012,
+            "humaneval": 0.004878048780487805
+        }
+    }
+}
diff --git a/scripts/collect.py b/scripts/collect.py
new file mode 100644
index 00000000..fa366ece
--- /dev/null
+++ b/scripts/collect.py
@@ -0,0 +1,259 @@
+#!/usr/bin/env python3
+"""
+Collect results from raw run directories into per-method CSVs.
+
+For each method directory in the results dir, does a single pass:
+  1. Finds the latest run per (benchmark, model)
+  2. Reads metrics.json, contamination files, and time_taken.txt
+  3. Applies baseline fallback for contaminated or errored cells
+  4. Writes final_{method}.csv, contamination_{method}.csv
+
+Also writes a time_overview.csv summarising average time per method.
+
+Usage:
+    python collect.py
+    python collect.py --data-dir /path/to/results --output-dir /path/to/output
+    python collect.py --min-run-id 100 --max-run-id 200
+"""
+import argparse
+import csv
+import os
+
+from utils import (
+    get_results_dir,
+    get_baseline_fallback_data,
+    walk_latest_runs,
+    load_metrics,
+    load_contamination,
+    load_disallowed_model,
+    combine_contamination_results,
+    load_time_taken,
+    is_number,
+    format_time_hms,
+    BUDGET_SECONDS,
+)
+
+# Directories to skip (baselines are hardcoded in baselines.json)
+SKIP_METHODS = {"baseline", "baseline_zeroshot"}
+
+
+def collect_method(
+    method_path: str,
+    method_name: str,
+    baseline_data: dict[str, dict[str, str]],
+    output_dir: str,
+    min_run_id: int | None = None,
+    max_run_id: int | None = None,
+) -> dict | None:
+    """
+    Collect the latest run per (benchmark, model) for one method directory.
+
+    Writes into output_dir:
+      - final_{method_name}.csv      (scores with baseline fallback)
+      - contamination_{method_name}.csv (flags, written before fallback)
+
+    Returns time stats dict {"total_seconds": int, "valid_count": int},
+    or None if no runs found. Raises KeyError if a needed baseline is missing.
+    """
+    latest_runs = walk_latest_runs(method_path, min_run_id, max_run_id)
+    if not latest_runs:
+        return None
+
+    benchmarks = sorted({b for b, m in latest_runs})
+    models = sorted({m for b, m in latest_runs})
+
+    # Collect metrics, contamination, and time in one pass over the full grid
+    metrics_grid = {}  # {model: {bench: str}}
+    contamination_grid = {}  # {model: {bench: str}}
+    time_total_seconds = 0
+    time_valid_count = 0
+
+    for model in models:
+        metrics_grid[model] = {}
+        contamination_grid[model] = {}
+
+        for bench in benchmarks:
+            key = (bench, model)
+            if key not in latest_runs:
+                metrics_grid[model][bench] = ""
+                contamination_grid[model][bench] = ""
+                continue
+
+            run_dir = latest_runs[key]["path"]
+
+            # Metrics
+            metrics_path = os.path.join(run_dir, "metrics.json")
+            metrics_grid[model][bench] = load_metrics(metrics_path, method_name)
+
+            # Contamination
+            contamination = load_contamination(
+                os.path.join(run_dir, "contamination_judgement.txt")
+            )
+            disallowed = load_disallowed_model(
+                os.path.join(run_dir, "disallowed_model_judgement.txt")
+            )
+            contamination_grid[model][bench] = combine_contamination_results(
+                contamination, disallowed
+            )
+
+            # Time (only runs whose load_time_taken seconds is not None count)
+            _, seconds = load_time_taken(run_dir)
+            if seconds is not None:
+                time_total_seconds += seconds
+                time_valid_count += 1
+
+    # Write contamination CSV
+    contamination_path = os.path.join(
+        output_dir, f"contamination_{method_name}.csv"
+    )
+    with open(contamination_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["model"] + benchmarks)
+        for model in models:
+            row = [model]
+            for bench in benchmarks:
+                row.append(contamination_grid[model][bench])
+            writer.writerow(row)
+    print(f"Written: {contamination_path}")
+
+    # Apply baseline fallback: replace cell with baseline if
+    #   (a) value is not a number (includes cells with no run, stored as ""), OR
+    #   (b) contamination flag is non-empty after stripping whitespace
+    for model in models:
+        for bench in benchmarks:
+            value = metrics_grid[model][bench]
+            contamination_value = contamination_grid[model][bench]
+
+            reasons = []
+            if not is_number(value):
+                reasons.append(f"non-numeric value ({value!r})")
+            if contamination_value.strip():
+                reasons.append(
+                    f"contamination flag ({contamination_value.strip()!r})"
+                )
+
+            if not reasons:
+                continue
+
+            if model not in baseline_data or bench not in baseline_data[model]:
+                raise KeyError(
+                    f"baselines.json missing entry for model={model!r} "
+                    f"benchmark={bench!r}; needed as fallback in method "
+                    f"{method_name!r} (triggered by {', '.join(reasons)})"
+                )
+            metrics_grid[model][bench] = baseline_data[model][bench]
+
+    # Write final CSV (scores with baseline fallback applied)
+    final_path = os.path.join(output_dir, f"final_{method_name}.csv")
+    with open(final_path, "w", newline="") as f:
+        writer = csv.writer(f)
+        writer.writerow(["model"] + benchmarks)
+        for model in models:
+            row = [model]
+            for bench in benchmarks:
+                row.append(metrics_grid[model].get(bench, ""))
+            writer.writerow(row)
+    print(f"Written: {final_path}")
+
+    return {
+        "total_seconds": time_total_seconds,
+        "valid_count": time_valid_count,
+    }
+
+
+def write_time_overview(method_stats: dict[str, dict], output_dir: str):
+    """Write time_overview.csv: one row per method, in sorted name order.
+
+    average_time is the floored mean of total_seconds over valid_count and
+    percentage is that mean relative to BUDGET_SECONDS; both read "N/A"
+    when valid_count is not positive.
+    """
+    overview_path = os.path.join(output_dir, "time_overview.csv")
+
+    with open(overview_path, "w", newline="") as f:
+        out = csv.writer(f)
+        out.writerow(["method", "average_time", "percentage"])
+
+        for name, stats in sorted(method_stats.items()):
+            total = stats["total_seconds"]
+            count = stats["valid_count"]
+            if count <= 0:
+                out.writerow([name, "N/A", "N/A"])
+                continue
+            mean_secs = total // count
+            hms = format_time_hms(mean_secs)
+            share = (mean_secs / BUDGET_SECONDS) * 100
+            out.writerow([name, hms, f"{share:.1f}%"])
+
+    print(f"Written: {overview_path}")
+
+
+def parse_args():
+    """Parse the command line.
+
+    Returns the argparse namespace with data_dir, output_dir, min_run_id and
+    max_run_id; every option defaults to None.
+    """
+    cli = argparse.ArgumentParser(
+        description="Collect raw results into per-method CSVs."
+    )
+    data_dir_help = (
+        "Directory containing method subdirectories with raw run data. "
+        "Defaults to POST_TRAIN_BENCH_RESULTS_DIR or 'results'."
+    )
+    output_dir_help = (
+        "Directory to write output CSVs. Defaults to same as --data-dir."
+    )
+    cli.add_argument("--data-dir", default=None, help=data_dir_help)
+    cli.add_argument("--output-dir", default=None, help=output_dir_help)
+
+    # The two run-ID bounds differ only in flag name and help text.
+    run_id_bounds = (
+        ("--min-run-id", "Inclusive lower bound for run IDs to consider."),
+        ("--max-run-id", "Exclusive upper bound for run IDs to consider."),
+    )
+    for flag, help_text in run_id_bounds:
+        cli.add_argument(flag, type=int, default=None, help=help_text)
+
+    return cli.parse_args()
+
+
+def main():
+    """Collect every non-baseline method under the data dir into CSVs."""
+    args = parse_args()
+
+    data_dir = args.data_dir or get_results_dir()
+    output_dir = args.output_dir or data_dir
+
+    # Check before makedirs: when output_dir defaults to data_dir, a mistyped
+    # path would otherwise be created empty and the script would exit 0.
+    if not os.path.isdir(data_dir):
+        raise SystemExit(f"error: data dir not found: {data_dir!r}")
+    os.makedirs(output_dir, exist_ok=True)
+
+    # Load baseline data for fallback (hardcoded in baselines.json)
+    baseline_data = get_baseline_fallback_data()
+
+    method_stats = {}
+    for method_name in sorted(os.listdir(data_dir)):
+        method_path = os.path.join(data_dir, method_name)
+        # Skip plain files and baseline directories (values are hardcoded).
+        if not os.path.isdir(method_path) or method_name in SKIP_METHODS:
+            continue
+
+        stats = collect_method(
+            method_path,
+            method_name,
+            baseline_data,
+            output_dir,
+            min_run_id=args.min_run_id,
+            max_run_id=args.max_run_id,
+        )
+        if stats is not None:
+            method_stats[method_name] = stats
+
+    if method_stats:
+        write_time_overview(method_stats, output_dir)
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/compute_baseline_metrics.py b/scripts/compute_baseline_metrics.py
deleted file mode 100755
index ee31e81a..00000000
--- a/scripts/compute_baseline_metrics.py
+++ /dev/null
@@ -1,121 +0,0 @@
-#!/usr/bin/env python3
-"""
-Compute metrics for baseline models (base and instruct-tuned) using factors from factors.json.
-
-Reads aggregated_baseline.csv and computes weighted metrics per model.
-"""
-import os
-import csv
-import json
-import argparse
-
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-FACTORS_PATH = os.path.join(SCRIPT_DIR, "factors.json")
-
-# Mapping from CSV model names to factors.json model names
-MODEL_NAME_MAPPING = {
-    # Base/pretrained models
-    "Qwen3-1.7B-Base": "Qwen3-1.7B",
-    "Qwen3-4B-Base": "Qwen3-4B",
-    "SmolLM3-3B-Base": "SmolLM3-3B",
-    "gemma-3-4b-pt": "gemma-3-4b",
-    # Instruct-tuned models
-    "Qwen3-1.7B": "Qwen3-1.7B",
-    "Qwen3-4B": "Qwen3-4B",
-    "SmolLM3-3B": "SmolLM3-3B",
-    "gemma-3-4b-it": "gemma-3-4b",
-}
-
-BASE_MODELS = {"Qwen3-1.7B-Base", "Qwen3-4B-Base", "SmolLM3-3B-Base", "gemma-3-4b-pt"}
-INSTRUCT_MODELS = {"Qwen3-1.7B", "Qwen3-4B", "SmolLM3-3B", "gemma-3-4b-it"}
-
-
-def get_results_dir():
-    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")
-
-
-def load_factors(factors_path: str) -> dict:
-    """Load factors from JSON file."""
-    with open(factors_path, "r") as f:
-        return json.load(f)
-
-
-def load_baseline_csv(csv_path: str) -> tuple[dict, list]:
-    """
-    Load baseline CSV file into a dict: {model: {benchmark: value}}.
-    Returns (data_dict, list_of_benchmarks).
-    """
-    data = {}
-    with open(csv_path, "r", newline="") as f:
-        reader = csv.reader(f)
-        header = next(reader)
-        benchmarks = header[1:]
-
-        for row in reader:
-            model = row[0]
-            data[model] = {}
-            for i, bench in enumerate(benchmarks):
-                data[model][bench] = float(row[i + 1])
-
-    return data, benchmarks
-
-
-def compute_metric(model_data: dict, factors: dict, benchmarks: list) -> float:
-    """Compute weighted sum of benchmark values using factors."""
-    total = 0.0
-    for bench in benchmarks:
-        value = model_data[bench]
-        factor = factors[bench]
-        total += value * factor
-    return total
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Compute baseline metrics using per-model factors."
-    )
-    parser.add_argument(
-        "--output",
-        default=None,
-        help="Output CSV path. Defaults to baseline_metrics.csv in results dir.",
-    )
-    args = parser.parse_args()
-
-    results_dir = get_results_dir()
-    factors = load_factors(FACTORS_PATH)
-
-    csv_path = os.path.join(results_dir, "aggregated_baseline.csv")
-    data, benchmarks = load_baseline_csv(csv_path)
-
-    # Compute metrics for each model
-    base_results = {}
-    instruct_results = {}
-
-    for csv_model in data:
-        factors_model = MODEL_NAME_MAPPING[csv_model]
-        model_factors = factors[factors_model]
-        metric = compute_metric(data[csv_model], model_factors, benchmarks)
-
-        if csv_model in BASE_MODELS:
-            base_results[csv_model] = metric
-        else:
-            instruct_results[csv_model] = metric
-
-    # Write output table
-    output_path = args.output or os.path.join(results_dir, "baseline_metrics.csv")
-
-    with open(output_path, "w", newline="") as f:
-        writer = csv.writer(f)
-        writer.writerow(["model_type", "model", "metric"])
-
-        for model in sorted(base_results.keys()):
-            writer.writerow(["base", model, base_results[model]])
-
-        for model in sorted(instruct_results.keys()):
-            writer.writerow(["instruct", model, instruct_results[model]])
-
-    print(f"Written: {output_path}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/compute_baseline_metrics_by_benchmark.py b/scripts/compute_baseline_metrics_by_benchmark.py
deleted file mode 100755
index e726c836..00000000
--- a/scripts/compute_baseline_metrics_by_benchmark.py
+++ /dev/null
@@ -1,97 +0,0 @@
-#!/usr/bin/env python3
-"""
-Compute metrics for baseline models (base and instruct-tuned) using factors from factors_by_benchmark.json.
-
-Reads aggregated_baseline.csv and computes averaged metrics for base and instruct groups.
-"""
-import os
-import csv
-import json
-import argparse
-
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-FACTORS_PATH = os.path.join(SCRIPT_DIR, "factors.json")
-
-BASE_MODELS = {"Qwen3-1.7B-Base", "Qwen3-4B-Base", "SmolLM3-3B-Base", "gemma-3-4b-pt"}
-INSTRUCT_MODELS = {"Qwen3-1.7B", "Qwen3-4B", "SmolLM3-3B", "gemma-3-4b-it"}
-
-
-def get_results_dir():
-    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")
-
-
-def load_factors(factors_path: str) -> dict:
-    """Load factors from JSON file."""
-    with open(factors_path, "r") as f:
-        return json.load(f)
-
-
-def load_baseline_csv(csv_path: str) -> tuple[dict, list]:
-    """
-    Load baseline CSV file into a dict: {model: {benchmark: value}}.
-    Returns (data_dict, list_of_benchmarks).
-    """
-    data = {}
-    with open(csv_path, "r", newline="") as f:
-        reader = csv.reader(f)
-        header = next(reader)
-        benchmarks = header[1:]
-
-        for row in reader:
-            model = row[0]
-            data[model] = {}
-            for i, bench in enumerate(benchmarks):
-                data[model][bench] = float(row[i + 1])
-
-    return data, benchmarks
-
-
-def compute_metric_by_benchmark(data: dict, factors: dict, benchmarks: list, models: set) -> float:
-    """
-    Compute weighted sum where each benchmark value is averaged across specified models.
-    """
-    total = 0.0
-    model_list = [m for m in data if m in models]
-    num_models = len(model_list)
-    for bench in benchmarks:
-        avg_value = sum(data[model][bench] for model in model_list) / num_models
-        factor = factors[bench]
-        total += avg_value * factor
-    return total
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Compute baseline metrics using benchmark factors."
-    )
-    parser.add_argument(
-        "--output",
-        default=None,
-        help="Output CSV path. Defaults to baseline_metrics_by_benchmark.csv in results dir.",
-    )
-    args = parser.parse_args()
-
-    results_dir = get_results_dir()
-    factors = load_factors(FACTORS_PATH)
-
-    csv_path = os.path.join(results_dir, "aggregated_baseline.csv")
-    data, benchmarks = load_baseline_csv(csv_path)
-
-    # Compute averaged metrics for each group
-    base_metric = compute_metric_by_benchmark(data, factors, benchmarks, BASE_MODELS)
-    instruct_metric = compute_metric_by_benchmark(data, factors, benchmarks, INSTRUCT_MODELS)
-
-    # Write output table
-    output_path = args.output or os.path.join(results_dir, "baseline_metrics_by_benchmark.csv")
-
-    with open(output_path, "w", newline="") as f:
-        writer = csv.writer(f)
-        writer.writerow(["model_type", "metric"])
-        writer.writerow(["base", base_metric])
-        writer.writerow(["instruct", instruct_metric])
-
-    print(f"Written: {output_path}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/compute_single_metrics.py b/scripts/compute_single_metrics.py
deleted file mode 100755
index b485106a..00000000
--- a/scripts/compute_single_metrics.py
+++ /dev/null
@@ -1,135 +0,0 @@
-#!/usr/bin/env python3
-"""
-Compute final metric for each final_*.csv table using factors from factors.json.
-
-For each benchmark, computes the average value across all models, then multiplies
-by the factor. Sums these to produce a single metric per method.
-"""
-import os
-import csv
-import json
-import argparse
-
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-FACTORS_PATH = os.path.join(SCRIPT_DIR, "factors.json")
-
-# Expected models in each complete CSV
-EXPECTED_MODELS = {
-    "Qwen3-1.7B-Base",
-    "Qwen3-4B-Base",
-    "SmolLM3-3B-Base",
-    "gemma-3-4b-pt",
-}
-
-
-def get_results_dir():
-    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")
-
-
-def load_factors(factors_path: str) -> dict:
-    """Load factors from JSON file."""
-    with open(factors_path, "r") as f:
-        return json.load(f)
-
-
-def load_final_csv(csv_path: str, valid_benchmarks: set) -> tuple[dict, list]:
-    """
-    Load a final CSV file into a dict: {model: {benchmark: value}}.
-    Only loads benchmarks that are in valid_benchmarks.
-    Returns (data_dict, list_of_benchmarks).
-    """
-    data = {}
-    with open(csv_path, "r", newline="") as f:
-        reader = csv.reader(f)
-        header = next(reader)
-        all_benchmarks = header[1:]
-        benchmarks = [b for b in all_benchmarks if b in valid_benchmarks]
-
-        for row in reader:
-            model = row[0]
-            data[model] = {}
-            for i, bench in enumerate(all_benchmarks):
-                if bench not in valid_benchmarks:
-                    continue
-                val = row[i + 1]
-                if val == "":
-                    continue
-                data[model][bench] = float(val)
-
-    return data, benchmarks
-
-
-def compute_metric(data: dict, factors: dict, benchmarks: list) -> float:
-    """
-    Compute weighted sum where each benchmark value is averaged across models.
-    """
-    total = 0.0
-    num_models = len(data)
-    for bench in benchmarks:
-        avg_value = sum(data[model][bench] for model in data) / num_models
-        factor = factors[bench]
-        total += avg_value * factor
-    return total
-
-
-def main():
-    parser = argparse.ArgumentParser(
-        description="Compute final metrics from final_*.csv files using benchmark factors."
-    )
-    parser.add_argument(
-        "--output",
-        default=None,
-        help="Output CSV path. Defaults to final_metrics.csv in results dir.",
-    )
-    args = parser.parse_args()
-
-    results_dir = get_results_dir()
-    factors = load_factors(FACTORS_PATH)
-    valid_benchmarks = set(factors.keys())
-
-    # Find all final_*.csv files, excluding final_time_* files
-    final_files = []
-    for filename in os.listdir(results_dir):
-        if not filename.startswith("final_"):
-            continue
-        if not filename.endswith(".csv"):
-            continue
-        if filename.startswith("final_time_"):
-            continue
-        # Check if file has all expected models
-        csv_path = os.path.join(results_dir, filename)
-        try:
-            data, _ = load_final_csv(csv_path, valid_benchmarks)
-        except Exception:
-            print(f"Warning: could not load {csv_path}.")
-            raise
-        if set(data.keys()) != EXPECTED_MODELS:
-            continue
-        final_files.append(filename)
-
-    final_files.sort()
-
-    # Compute metrics for each file
-    results = {}  # {method_name: metric}
-    for filename in final_files:
-        method_name = filename[len("final_") : -len(".csv")]
-        csv_path = os.path.join(results_dir, filename)
-        data, benchmarks = load_final_csv(csv_path, valid_benchmarks)
-        metric = compute_metric(data, factors, benchmarks)
-        results[method_name] = metric
-
-    # Write output table
-    output_path = args.output or os.path.join(results_dir, "final_metrics.csv")
-
-    with open(output_path, "w", newline="") as f:
-        writer = csv.writer(f)
-        writer.writerow(["method", "metric"])
-
-        for method_name in sorted(results.keys()):
-            writer.writerow([method_name, results[method_name]])
-
-    print(f"Written: {output_path}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/compute_single_metrics_avg_stddev.py b/scripts/compute_single_metrics_avg_stddev.py
deleted file mode 100755
index 874284a7..00000000
--- a/scripts/compute_single_metrics_avg_stddev.py
+++ /dev/null
@@ -1,162 +0,0 @@
-#!/usr/bin/env python3
-"""
-Compute final metric for each final_*.csv table using factors from factors.json,
-then aggregate across multiple runs to produce averages and standard deviations.
-
-For each benchmark, computes the average value across all models, then multiplies
-by the factor. Sums these to produce a single metric per method.
-
-Then, for each agent group defined in HARDCODED_AGENT_MAP, computes the average
-and standard deviation of the metrics across runs.
-"""
-import os
-import csv
-import json
-import math
-
-from constants import HARDCODED_AGENT_MAP, HARDCODED_BENCHMARKS
-
-SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
-FACTORS_PATH = os.path.join(SCRIPT_DIR, "factors.json")
-
-# Expected models in each complete CSV
-EXPECTED_MODELS = {
-    "Qwen3-1.7B-Base",
-    "Qwen3-4B-Base",
-    "SmolLM3-3B-Base",
-    "gemma-3-4b-pt",
-}
-
-def get_results_dir():
-    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")
-
-
-def mean(values: list[float]) -> float:
-    return sum(values) / len(values)
-
-
-def stddev(values: list[float]) -> float:
-    avg = mean(values)
-    variance = sum((x - avg) ** 2 for x in values) / (len(values) - 1)
-    return math.sqrt(variance)
-
-
-def load_factors(factors_path: str) -> dict:
-    """Load factors from JSON file."""
-    with open(factors_path, "r") as f:
-        return json.load(f)
-
-
-def load_final_csv(csv_path: str, valid_benchmarks: set) -> tuple[dict, list]:
-    """
-    Load a final CSV file into a dict: {model: {benchmark: value}}.
-    Only loads benchmarks that are in valid_benchmarks.
-    Returns (data_dict, list_of_benchmarks).
-    """
-    data = {}
-    with open(csv_path, "r", newline="") as f:
-        reader = csv.reader(f)
-        header = next(reader)
-        all_benchmarks = header[1:]
-        benchmarks = [b for b in all_benchmarks if b in valid_benchmarks]
-
-        for row in reader:
-            model = row[0]
-            data[model] = {}
-            for i, bench in enumerate(all_benchmarks):
-                if bench not in valid_benchmarks:
-                    continue
-                val = row[i + 1]
-                if val == "":
-                    continue
-                data[model][bench] = float(val)
-
-    return data, benchmarks
-
-
-def compute_metric(data: dict, factors: dict, benchmarks: list) -> float:
-    """
-    Compute weighted sum where each benchmark value is averaged across models.
-    """
-    total = 0.0
-    num_models = len(data)
-    for bench in benchmarks:
-        avg_value = sum(data[model][bench] for model in data) / num_models
-        factor = factors[bench]
-        total += avg_value * factor
-    return total
-
-
-def compute_all_metrics(results_dir: str, factors: dict) -> dict[str, float]:
-    """
-    Compute metrics for all final_*.csv files.
-    Returns {method_name: metric}.
-    """
-    valid_benchmarks = set(factors.keys())
-    results = {}
-
-    for filename in os.listdir(results_dir):
-        if not filename.startswith("final_"):
-            continue
-        if not filename.endswith(".csv"):
-            continue
-        if filename.startswith("final_time_"):
-            continue
-
-        csv_path = os.path.join(results_dir, filename)
-        data, benchmarks = load_final_csv(csv_path, valid_benchmarks)
-
-        if set(data.keys()) != EXPECTED_MODELS:
-            continue
-
-        method_name = filename[len("final_") : -len(".csv")]
-        metric = compute_metric(data, factors, benchmarks)
-        results[method_name] = metric
-
-    return results
-
-
-def main():
-    results_dir = get_results_dir()
-    factors = load_factors(FACTORS_PATH)
-
-    # Compute metrics for all methods
-    all_metrics = compute_all_metrics(results_dir, factors)
-
-    # Write individual metrics CSV
-    metrics_path = os.path.join(results_dir, "single_metrics.csv")
-    with open(metrics_path, "w", newline="") as f:
-        writer = csv.writer(f)
-        writer.writerow(["method", "metric"])
-        for method_name in sorted(all_metrics.keys()):
-            writer.writerow([method_name, all_metrics[method_name]])
-    print(f"Written: {metrics_path}")
-
-    # Compute aggregated metrics for each agent group
-    aggregated_results = {}
-
-    for agent_name, method_names in HARDCODED_AGENT_MAP.items():
-        metrics = []
-        for method_name in method_names:
-            metric = all_metrics[method_name]
-            metrics.append(metric)
-
-        aggregated_results[agent_name] = {
-            "avg": mean(metrics),
-            "std": stddev(metrics),
-            "n": len(metrics),
-        }
-
-    # Write aggregated metrics CSV
-    aggregated_path = os.path.join(results_dir, "single_metrics_aggregated.csv")
-    with open(aggregated_path, "w", newline="") as f:
-        writer = csv.writer(f)
-        writer.writerow(["agent", "avg", "std", "n"])
-        for agent_name in sorted(aggregated_results.keys()):
-            data = aggregated_results[agent_name]
-            writer.writerow([agent_name, data["avg"], data["std"], data["n"]])
-    print(f"Written: {aggregated_path}")
-
-
-if __name__ == "__main__":
-    main()
diff --git a/scripts/constants.py b/scripts/constants.py
index c1dbd615..c45bcefa 100644
--- a/scripts/constants.py
+++ b/scripts/constants.py
@@ -62,7 +62,13 @@
     "claude_non_api_claude-opus-4-6_1m__10h_run1",
     "claude_non_api_claude-opus-4-6_1m__10h_run2",
     "claude_non_api_claude-opus-4-6_1m__10h_run3"
-    ]
+    ],
+
+    "Opus-4.7":[
+    "claude_non_api_claude-opus-4-7_10h",
+    "claude_non_api_claude-opus-4-7_10h_run2",
+    "claude_non_api_claude-opus-4-7_10h_run3"
+    ] 
 
 }
 
diff --git a/scripts/rerun_eval_n_times.sh b/scripts/rerun_eval_n_times.sh
new file mode 100755
index 00000000..c3b89263
--- /dev/null
+++ b/scripts/rerun_eval_n_times.sh
@@ -0,0 +1,152 @@
+#!/bin/bash
+# Re-run the per-task evaluate.py N times on an already-finished EVAL_DIR
+# and aggregate per-run metrics into <EVAL_DIR>/metrics_averaged.json.
+#
+# Usage:
+#   scripts/rerun_eval_n_times.sh <EVAL_DIR> [N]
+#
+# Defaults: N=5.
+#
+# Mirrors run_task.sh's evaluation step: runs src/eval/tasks/<task>/evaluate.py
+# (NOT the snapshot in $EVAL_DIR/task) under the same vllm_debug container with
+# the same fuse-overlayfs HF cache and the same max-tokens fallback ladder.
+#
+# Run from the repo root, on a node with GPUs (submit via
+# src/commit_utils/rerun_eval.sub for cluster execution).
+set -euo pipefail
+
+if [ "$#" -lt 1 ]; then
+    echo "usage: $0 <EVAL_DIR> [N]" >&2
+    exit 1
+fi
+
+EVAL_DIR="$(realpath "$1")"
+N="${2:-5}"
+
+if [ ! -d "$EVAL_DIR/final_model" ]; then
+    echo "ERROR: $EVAL_DIR/final_model not found" >&2
+    exit 1
+fi
+
+source src/commit_utils/set_env_vars.sh
+
+# The task name is the EVAL_DIR basename (<task>_<model_safe>_<cluster_id>) up to its first underscore.
+EVAL_BASENAME="$(basename "$EVAL_DIR")"
+EVALUATION_TASK="${EVAL_BASENAME%%_*}"
+
+if [ ! -f "src/eval/tasks/${EVALUATION_TASK}/evaluate.py" ]; then
+    echo "ERROR: src/eval/tasks/${EVALUATION_TASK}/evaluate.py not found" >&2
+    echo "       (parsed task '${EVALUATION_TASK}' from $(basename "$EVAL_DIR"))" >&2
+    exit 1
+fi
+
+REPO_ROOT="$(pwd)"
+RERUNS_DIR="$EVAL_DIR/reruns"
+mkdir -p "$RERUNS_DIR"
+
+# Per-task max-tokens fallback ladder, mirroring run_task.sh.
+case "$EVALUATION_TASK" in
+    aime2025)         FB1="--max-tokens 12000";    FB2="--max-tokens 8000" ;;
+    arenahardwriting) FB1="--max-new-tokens 12288"; FB2="--max-new-tokens 8192" ;;
+    bfcl)             FB1="--max-tokens 12000";    FB2="--max-tokens 8000" ;;
+    gpqamain)         FB1="--max-tokens 12000";    FB2="--max-tokens 8000" ;;
+    gsm8k)            FB1="--max-tokens 3000";     FB2="--max-tokens 2000" ;;
+    healthbench)      FB1="--max-new-tokens 12288"; FB2="--max-new-tokens 8192" ;;
+    humaneval)        FB1="--max-tokens 3000";     FB2="--max-tokens 2000" ;;
+    *)                FB1="";                       FB2="" ;;
+esac
+
+# Fuse-overlayfs HF cache so reruns don't pollute the shared HF cache,
+# matching run_task.sh's with_huggingface_overlay helper.
+TMP_SUBDIR="/tmp/rerun_eval_$$"
+HF_MERGED="${TMP_SUBDIR}/merged_huggingface"
+TMP_HF_CACHE="/tmp/hf_cache_rerun_$$"
+
+setup_overlay() {  # mount ${HF_HOME} as the lower layer of an overlay at ${HF_MERGED}
+    local upper="${TMP_SUBDIR}/upper_huggingface"
+    local work="${TMP_SUBDIR}/fuse_workdir"
+    mkdir -p "$upper" "$work" "${HF_MERGED}"
+    fuse-overlayfs \
+        -o "lowerdir=${HF_HOME},upperdir=${upper},workdir=${work}" \
+        "${HF_MERGED}"
+}
+
+teardown_overlay() {  # EXIT trap: best-effort unmount, then remove the scratch dirs
+    fusermount -u "${HF_MERGED}" 2>/dev/null || true
+    rm -rf --one-file-system "${TMP_SUBDIR}" 2>/dev/null || true  # never descend into a still-mounted overlay
+}
+trap teardown_overlay EXIT
+
+setup_overlay
+
+run_one() {
+    local out_json="$1"   # path handed to evaluate.py --json-output-file
+    local extra="$2"      # extra evaluate.py flags; expanded unquoted below so they word-split
+    local log="$3"        # file that receives the run's stdout and stderr
+    # SIGKILL every PID nvidia-smi reports as a GPU compute app (errors ignored), then wait 5s.
+    nvidia-smi --query-compute-apps=pid --format=csv,noheader 2>/dev/null \
+        | xargs -r kill -9 2>/dev/null || true
+    sleep 5
+    # Run evaluate.py in the container; timeout sends TERM after 28800s (8h) and KILL 60s later.
+    timeout --signal=TERM --kill-after=60s 28800s \
+    apptainer exec \
+        --nv \
+        --env "HF_HOME=${TMP_HF_CACHE}" \
+        --env OPENAI_API_KEY="${OPENAI_API_KEY:-}" \
+        --env VLLM_API_KEY="inspectai" \
+        --env PYTHONNOUSERSITE="1" \
+        --writable-tmpfs \
+        --bind "${REPO_ROOT}:${REPO_ROOT}" \
+        --bind "${HF_MERGED}:${TMP_HF_CACHE}" \
+        --pwd "${REPO_ROOT}/src/eval/tasks/${EVALUATION_TASK}" \
+        "${POST_TRAIN_BENCH_CONTAINERS_DIR}/${POST_TRAIN_BENCH_CONTAINER_NAME}.sif" \
+        python evaluate.py \
+            --model-path "${EVAL_DIR}/final_model" \
+            --templates-dir ../../../../src/eval/templates \
+            --limit -1 \
+            ${extra} \
+            --json-output-file "${out_json}" >"${log}" 2>&1
+}
+
+run_with_fallback() {  # try the default settings, then each non-empty max-tokens fallback
+    local out_json="$1"
+    local log_prefix="$2"
+    local level extra
+    rm -f "$out_json"  # success is detected below by this file (re)appearing
+    for level in default fb1 fb2; do
+        case "$level" in
+            default) extra="" ;;
+            fb1)     extra="$FB1" ;;
+            fb2)     extra="$FB2" ;;
+        esac
+        # Tasks without a ladder have empty FB1/FB2; a fallback with no flags would only repeat the default run.
+        if [ "$level" != default ] && [ -z "$extra" ]; then continue; fi
+        echo "  [$level] extra='${extra}'"
+        run_one "$out_json" "$extra" "${log_prefix}_${level}.log" || true
+        if [ -f "$out_json" ]; then
+            return 0
+        fi
+    done
+    return 1
+}
+
+echo "EVAL_DIR=${EVAL_DIR}"
+echo "EVALUATION_TASK=${EVALUATION_TASK}"
+echo "N=${N}"
+
+for i in $(seq 1 "$N"); do
+    out="${RERUNS_DIR}/run_${i}.json"
+    log_prefix="${RERUNS_DIR}/run_${i}"
+    echo "=== rerun ${i} / ${N} ==="
+    if run_with_fallback "$out" "$log_prefix"; then
+        echo "  -> wrote $out"
+    else
+        echo "  -> FAILED all fallbacks for rerun ${i}"
+    fi
+done
+
+python scripts/aggregate_metrics_runs.py \
+    --runs-glob "${RERUNS_DIR}/run_*.json" \
+    --output "${EVAL_DIR}/metrics_averaged.json"
+
+echo "Wrote ${EVAL_DIR}/metrics_averaged.json"
diff --git a/scripts/utils.py b/scripts/utils.py
new file mode 100644
index 00000000..ab6af6d8
--- /dev/null
+++ b/scripts/utils.py
@@ -0,0 +1,410 @@
+#!/usr/bin/env python3
+"""Shared constants and utility functions for aggregation scripts."""
+import csv
+import json
+import math
+import os
+import re
+
+
+# ---------------------------------------------------------------------------
+# Constants
+# ---------------------------------------------------------------------------
+
+SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))  # directory containing this file
+FACTORS_PATH = os.path.join(SCRIPT_DIR, "factors.json")  # read by load_factors()
+BASELINES_PATH = os.path.join(SCRIPT_DIR, "baselines.json")  # read by load_baselines()
+
+HARDCODED_AGENT_MAP = {  # agent label -> names of the runs grouped under it
+    "Opus-4.5": [
+        "claude_claude-opus-4-5_10h_final_v3",
+        "claude_claude-opus-4-5_10h_v5",
+        "claude_claude-opus-4-5_10h_v6_seed1",
+    ],
+    "GPT-5.1-Codex-Max": [
+        "codex_gpt-5.1-codex-max_10h_final_v3",
+        "codex_gpt-5.1-codex-max_10h_v4_seed1",
+        "codex_gpt-5.1-codex-max_10h_v4_seed2",
+    ],
+    "GPT-5.2-Codex": [
+        "codex_gpt-5.2-codex_10h_v6",
+        "codex_gpt-5.2-codex_10h_v6_seed1",
+        "codex_gpt-5.2-codex_10h_v6_seed2",
+    ],
+    "GPT-5.2": [
+        "codex_gpt-5.2_10h_v4",
+        "codex_gpt-5.2_10h_v6_seed1",
+        "codex_gpt-5.2_10h_v6_seed2",
+    ],
+    "Gemini-3-Pro": [
+        "gemini_models_gemini-3-pro-preview_10h_final_v3",
+        "gemini_models_gemini-3-pro-preview_10h_v5",
+        "gemini_models_gemini-3-pro-preview_10h_v6_seed1",
+    ],
+    "GPT-5.1-Codex-Max Low": [
+        "codexlow_gpt-5.1-codex-max_10h_v7",
+        "codexlow_gpt-5.1-codex-max_10h_v7_seed1",
+    ],
+    "GPT-5.1-Codex-Max High": [
+        "codexhigh_gpt-5.1-codex-max_10h_v7",
+        "codexhigh_gpt-5.1-codex-max_10h_v7_seed1",
+    ],
+    "Opus-4.6": [
+        "claude_claude-opus-4-6_10h_run1_old_container",
+        "claude_claude-opus-4-6_10h_run2",
+        "claude_claude-opus-4-6_10h_run3",
+    ],
+    "GPT-5.3-Codex_Med": [
+        "codex_non_api_gpt-5.3-codex_10h_run1",
+        "codex_non_api_gpt-5.3-codex_10h_run2",
+        "codex_non_api_gpt-5.3-codex_10h_run3",
+    ],
+    "Gemini-3.1-Pro": [
+        "opencode_opencode_gemini-3.1-pro_10h_run1",
+        "opencode_opencode_gemini-3.1-pro_10h_run2",
+        "opencode_opencode_gemini-3.1-pro_10h_run3",
+    ],
+    "GPT-5.3-Codex_High": [
+        "codex_non_api_high_gpt-5.3-codex_10h_run1",
+        "codex_non_api_high_gpt-5.3-codex_10h_run2",
+        "codex_non_api_high_gpt-5.3-codex_10h_run3",
+    ],
+    "GPT-5.4-High": [
+        "codex_non_api_high_gpt-5.4_10h_run1",
+        "codex_non_api_high_gpt-5.4_10h_run2",
+        "codex_non_api_high_gpt-5.4_10h_run3",
+    ],
+    "Opus-4.6-1M": [
+        "claude_non_api_claude-opus-4-6_1m__10h_run1",
+        "claude_non_api_claude-opus-4-6_1m__10h_run2",
+        "claude_non_api_claude-opus-4-6_1m__10h_run3",
+    ],
+    "Opus-4.7": [
+        "claude_non_api_claude-opus-4-7_10h",
+        "claude_non_api_claude-opus-4-7_10h_run2",
+        "claude_non_api_claude-opus-4-7_10h_run3",
+    ],
+    "GPT-5.5-xHigh": [
+        # Only two runs are listed for this agent.
+        "codex_non_api_xhigh_gpt-5.5_10h_run1",
+        "codex_non_api_xhigh_gpt-5.5_10h_run2",
+    ],
+}
+
+HARDCODED_BENCHMARKS = [  # benchmark names, listed alphabetically
+    "aime2025",
+    "arenahardwriting",
+    "bfcl",
+    "gpqamain",
+    "gsm8k",
+    "healthbench",
+    "humaneval",
+]
+
+EXPECTED_MODELS = {  # short model names, without an organisation prefix
+    "Qwen3-1.7B-Base",
+    "Qwen3-4B-Base",
+    "SmolLM3-3B-Base",
+    "gemma-3-4b-pt",
+}
+
+BUDGET_SECONDS = 10 * 3600  # 10 hours
+
+
+def load_factors() -> dict:
+    with open(FACTORS_PATH, "r") as f:  # factors.json next to this script
+        return json.load(f)  # parsed JSON, returned as-is (no validation)
+
+
+def load_baselines() -> dict:
+    """Read ``baselines.json`` (``BASELINES_PATH``) and return it parsed.
+
+    Expected shape: {"zeroshot": {model: {bench: value}}, "fewshot": {...}},
+    where every value is a float.
+    """
+    with open(BASELINES_PATH, "r") as baselines_file:
+        return json.load(baselines_file)
+
+
+def get_baseline_fallback_data() -> dict[str, dict[str, str]]:
+    """Return the zeroshot baselines with every score converted to ``str``.
+
+    Shape: {model: {bench: str_value}}. Used as fallback data in place of
+    reading aggregated_baseline_zeroshot.csv.
+    """
+    zeroshot = load_baselines()["zeroshot"]
+    return {
+        model: {bench: str(score) for bench, score in scores.items()}
+        for model, scores in zeroshot.items()
+    }
+
+
+# ---------------------------------------------------------------------------
+# Stats
+# ---------------------------------------------------------------------------
+
+def mean(values: list[float]) -> float:  # arithmetic mean
+    return sum(values) / len(values)  # ZeroDivisionError when values is empty
+
+
+def stddev(values: list[float]) -> float:
+    """Sample standard deviation (n - 1 divisor); needs at least 2 values."""
+    centre = mean(values)
+    return math.sqrt(sum((x - centre) ** 2 for x in values) / (len(values) - 1))
+
+
+# ---------------------------------------------------------------------------
+# Paths
+# ---------------------------------------------------------------------------
+
+def get_results_dir() -> str:  # results root directory
+    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")  # env override, else "results"
+
+
+# ---------------------------------------------------------------------------
+# CSV I/O
+# ---------------------------------------------------------------------------
+
+def is_number(value: str) -> bool:
+    """Return True if *value* is non-empty and ``float()`` can parse it."""
+    try:
+        # Falsy input (such as "") becomes "", which float() always rejects.
+        float(value or "")
+    except ValueError:
+        return False
+    return True
+
+
+def load_csv_as_dict(csv_path: str) -> tuple[dict[str, dict[str, str]], list[str]]:
+    """
+    Read a score grid CSV into ``{model: {benchmark: value}}``.
+
+    The first column of every row is the model name and the remaining header
+    cells are the benchmark names. Rows shorter than the header are padded
+    with "" and extra cells are ignored; empty rows are skipped, and a model
+    that appears twice keeps its last row.
+
+    Returns:
+        ``(data, benchmarks)``; ``({}, [])`` when the file is missing or has
+        no header row.
+    """
+    if not os.path.exists(csv_path):
+        return {}, []
+
+    with open(csv_path, "r", newline="") as handle:
+        rows = csv.reader(handle)
+        header = next(rows, None)
+        if not header:
+            return {}, []
+        benchmarks = header[1:]
+        table = {}
+        for cells in rows:
+            if not cells:
+                continue
+            # Pad so that every benchmark column has a cell to zip against.
+            values = cells[1:] + [""] * (len(benchmarks) - len(cells) + 1)
+            table[cells[0]] = dict(zip(benchmarks, values))
+
+    return table, benchmarks
+
+
+def write_csv(
+    path: str,
+    models: list[str],
+    benchmarks: list[str],
+    data: dict[str, dict[str, str]],
+):
+    """Write a ``model`` x benchmark grid to *path*, one row per model.
+
+    A benchmark missing for a model becomes ""; an unknown model is a KeyError.
+    """
+    with open(path, "w", newline="") as out:
+        grid = csv.writer(out)
+        grid.writerow(["model"] + benchmarks)
+        grid.writerows([m] + [data[m].get(b, "") for b in benchmarks] for m in models)
+
+
+# ---------------------------------------------------------------------------
+# Walking result directories
+# ---------------------------------------------------------------------------
+
+def walk_latest_runs(
+    method_path: str,
+    min_run_id: int | None = None,
+    max_run_id: int | None = None,
+) -> dict[tuple[str, str], dict]:
+    """
+    Scan *method_path* and keep the highest-numbered run per (benchmark, model).
+
+    Sub-directories must be named ``<benchmark>_<x>_<model>_<run_id>`` (four
+    "_"-separated parts, integer id); any other name raises ``ValueError``.
+    Ids below *min_run_id* or at/above *max_run_id* are ignored.
+
+    Returns {(benchmark, model): {"run_id": int, "path": str}}.
+    """
+    newest = {}
+    for name in os.listdir(method_path):
+        full_path = os.path.join(method_path, name)
+        if not os.path.isdir(full_path):
+            continue
+        # Unpacking demands exactly four parts, so a wrong count raises as well.
+        try:
+            benchmark, _, model, run_id_text = name.split("_")
+            run_id = int(run_id_text)
+        except ValueError:
+            print(name)
+            raise ValueError(f"{name}, {method_path}")
+        too_new = max_run_id is not None and run_id >= max_run_id
+        too_old = min_run_id is not None and run_id < min_run_id
+        if too_new or too_old:
+            continue
+        best = newest.get((benchmark, model))
+        if best is None or run_id > best["run_id"]:
+            newest[(benchmark, model)] = {"run_id": run_id, "path": full_path}
+    return newest
+
+
+# ---------------------------------------------------------------------------
+# Metrics loading
+# ---------------------------------------------------------------------------
+
+def load_metrics(metrics_path: str, method_name: str | None = None) -> str:
+    """
+    Return the accuracy stored in *metrics_path* as a string, or an error label.
+
+    The accuracy is the "accuracy" key of the JSON file. If the file is
+    missing, unreadable, or has no such value, a label is derived from the
+    directory containing *metrics_path*:
+      - "not avl."   if time_taken.txt doesn't exist
+      - "not stored" if time_taken.txt exists but final_model/ doesn't
+      - "ERR"        otherwise
+    For method_name "baseline_zeroshot" the label is always "ERR".
+    """
+    if os.path.exists(metrics_path):
+        try:
+            with open(metrics_path, "r") as f:
+                data = json.load(f)
+            acc = data.get("accuracy")
+            if acc is not None:
+                return str(acc)
+        except Exception:
+            # Best effort: any read/parse problem falls through to a label.
+            pass
+
+    if method_name == "baseline_zeroshot":
+        return "ERR"
+
+    run_dir = os.path.dirname(metrics_path)
+    if not os.path.exists(os.path.join(run_dir, "time_taken.txt")):
+        return "not avl."
+    if not os.path.isdir(os.path.join(run_dir, "final_model")):
+        return "not stored"
+    return "ERR"
+
+
+# ---------------------------------------------------------------------------
+# Contamination loading
+# ---------------------------------------------------------------------------
+
+def load_contamination(contamination_path: str):
+    """Read a contamination verdict file.
+
+    Returns True / False for the two known verdicts, "IMPORTANT ERR" for any
+    other content, and "ERR" if the file is missing or unreadable.
+    """
+    if not os.path.exists(contamination_path):
+        return "ERR"
+    try:
+        with open(contamination_path, "r") as verdict_file:
+            verdict = verdict_file.read().strip()
+    except Exception:
+        return "ERR"
+    known = {"contamination detected": True, "no contamination detected": False}
+    return known.get(verdict, "IMPORTANT ERR")
+
+
+def load_disallowed_model(disallowed_path: str):
+    """Read a disallowed-model verdict file.
+
+    Returns True / False for the two known verdicts, "IMPORTANT ERR" for any
+    other content, and "ERR" if the file is missing or unreadable.
+    """
+    if not os.path.exists(disallowed_path):
+        return "ERR"
+    try:
+        with open(disallowed_path, "r") as verdict_file:
+            verdict = verdict_file.read().strip()
+    except Exception:
+        return "ERR"
+    known = {"disallowed use detected": True, "only allowed use detected": False}
+    return known.get(verdict, "IMPORTANT ERR")
+
+
+def combine_contamination_results(contamination, disallowed_model) -> str:
+    """
+    Merge the contamination and disallowed-model verdicts into one cell value.
+
+    Each argument is True, False, "ERR" or "IMPORTANT ERR" (the values the two
+    loaders return). If either is an error string, the result lists the
+    failing checks as "C:<err>" and/or "M:<err>" joined by a space; the other
+    verdict is not reported. Otherwise the result is "M" and/or "C" for the
+    flags that are set, i.e. "" (clean), "C", "M" or "MC".
+    """
+    error_values = ("ERR", "IMPORTANT ERR")
+    failures = [
+        f"{tag}:{verdict}"
+        for tag, verdict in (("C", contamination), ("M", disallowed_model))
+        if verdict in error_values
+    ]
+    if failures:
+        return " ".join(failures)
+
+    cell = ""
+    if disallowed_model:
+        cell += "M"
+    if contamination:
+        cell += "C"
+    return cell
+
+
+# ---------------------------------------------------------------------------
+# Time loading
+# ---------------------------------------------------------------------------
+
+def parse_time_hms(time_str: str) -> int | None:
+    """Convert "H:M:S" text to seconds; None if malformed or M/S is >= 60."""
+    parts = re.match(r"^(\d+):(\d{1,2}):(\d{1,2})$", time_str.strip())
+    if parts is None:
+        return None
+    h, m, s = (int(group) for group in parts.groups())
+    # Hours are unbounded, but minutes and seconds must be real clock values.
+    in_range = m < 60 and s < 60
+    return (h * 60 + m) * 60 + s if in_range else None
+
+
+def format_time_hms(total_seconds: int) -> str:
+    """Render a second count as H:MM:SS (hours unpadded, may exceed 24)."""
+    total_minutes, seconds = divmod(total_seconds, 60)
+    hours, minutes = divmod(total_minutes, 60)
+    # divmod floors exactly like the // and % pair it stands in for.
+    return f"{hours}:{minutes:02d}:{seconds:02d}"
+
+
+def load_time_taken(run_dir: str) -> tuple[str, int | None]:
+    """
+    Read ``time_taken.txt`` from *run_dir*.
+
+    Returns (display_string, total_seconds), or ("ERR", None) when the file
+    is missing, unreadable or not a valid H:M:S value.
+    """
+    failure = ("ERR", None)
+    source = os.path.join(run_dir, "time_taken.txt")
+    if not os.path.exists(source):
+        return failure
+    try:
+        with open(source, "r") as handle:
+            elapsed = parse_time_hms(handle.read().strip())
+        if elapsed is None:
+            return failure
+        return format_time_hms(elapsed), elapsed
+    except Exception:
+        return failure
diff --git a/scripts/verify.py b/scripts/verify.py
new file mode 100644
index 00000000..857703a6
--- /dev/null
+++ b/scripts/verify.py
@@ -0,0 +1,261 @@
+#!/usr/bin/env python3
+"""
+Verify that refactored aggregation scripts produce identical outputs
+to the original pipeline.
+
+Usage:
+    python verify.py --ground-truth /fast/hbhatnagar/ptb_results/ \
+                     --new-output /fast/hbhatnagar/ptb_results_new/
+
+Compares all key output CSVs cell-by-cell:
+  - final_{method}.csv          (per-method score grids)
+  - contamination_{method}.csv  (per-method contamination flags)
+  - single_metrics.csv          (weighted score per run)
+  - single_metrics_aggregated.csv (avg/std per agent group)
+  - aggregated_avg_{agent}.csv  (per-cell avg for multi-run agents)
+  - aggregated_std_{agent}.csv  (per-cell std for multi-run agents)
+  - time_aggregated.csv         (avg/std time per agent)
+"""
+import argparse
+import csv
+import os
+import sys
+
+
+FLOAT_TOLERANCE = 1e-10  # numeric cells closer than this (absolute difference) count as equal
+
+
+def is_number(s: str) -> bool:
+    """Return True if *s* is non-empty and ``float()`` can parse it."""
+    try:
+        # Falsy input (such as "") becomes "", which float() always rejects.
+        float(s or "")
+    except ValueError:
+        return False
+    return True
+
+
+def load_csv(path: str) -> list[list[str]]:
+    with open(path, "r", newline="") as f:  # newline="" as the csv module expects
+        return list(csv.reader(f))  # every row, header included, as lists of strings
+
+
+def compare_csvs(gt_path: str, new_path: str) -> list[str]:
+    """
+    Diff two CSV files cell by cell.
+
+    Cells match when their text is identical, or when both parse as floats
+    that differ by less than FLOAT_TOLERANCE. Rows and columns are compared
+    by position; if the files disagree on the number of rows (or a row pair
+    on the number of columns) that is reported and only the overlapping part
+    is compared.
+
+    Args:
+        gt_path: ground-truth CSV; its header row and first column label
+            the mismatch messages.
+        new_path: CSV to check against it.
+
+    Returns:
+        One message per mismatch; an empty list means the files match.
+    """
+    problems = []
+    expected_rows = load_csv(gt_path)
+    actual_rows = load_csv(new_path)
+    header = expected_rows[0] if expected_rows else []
+
+    if len(expected_rows) != len(actual_rows):
+        problems.append(
+            f"Row count differs: {len(expected_rows)} vs {len(actual_rows)}"
+        )
+
+    # zip() stops at the shorter input, i.e. at the overlap of the two files.
+    for row_idx, (gt_row, new_row) in enumerate(zip(expected_rows, actual_rows)):
+        if len(gt_row) != len(new_row):
+            problems.append(
+                f"  Row {row_idx}: column count differs: "
+                f"{len(gt_row)} vs {len(new_row)}"
+            )
+
+        for col_idx, (gt_val, new_val) in enumerate(zip(gt_row, new_row)):
+            if gt_val == new_val:
+                continue
+            # Not textually equal: accept numeric cells within FLOAT_TOLERANCE.
+            both_numeric = is_number(gt_val) and is_number(new_val)
+            if both_numeric and abs(float(gt_val) - float(new_val)) < FLOAT_TOLERANCE:
+                continue
+
+            # Data rows get a "(row label, column name)" suffix for context.
+            where = ""
+            if row_idx > 0 and header:
+                col_name = header[col_idx] if col_idx < len(header) else "?"
+                where = f" ({gt_row[0]}, {col_name})"
+            problems.append(
+                f"  Row {row_idx}, Col {col_idx}{where}: "
+                f"'{gt_val}' vs '{new_val}'"
+            )
+
+    return problems
+
+
+def find_matching_files(gt_dir: str, new_dir: str) -> dict[str, tuple[str, str]]:
+    """
+    Pair up the verifiable CSVs that are present in both directories.
+
+    Only names ending in ".csv" that should_verify() accepts are kept.
+    Returns {filename: (gt_path, new_path)}, inserted in sorted name order.
+    """
+    gt_names = {name for name in os.listdir(gt_dir) if name.endswith(".csv")}
+    new_names = {name for name in os.listdir(new_dir) if name.endswith(".csv")}
+    shared = sorted(gt_names & new_names)
+
+    return {
+        name: (os.path.join(gt_dir, name), os.path.join(new_dir, name))
+        for name in shared
+        if should_verify(name)
+    }
+
+
+def should_verify(filename: str) -> bool:
+    """
+    Decide if a CSV file should be verified.
+
+    Verified:
+      - final_*.csv and contamination_*.csv, except the baseline files and
+        the final_avg_* / final_std_* / final_time_* artifacts
+      - single_metrics.csv, single_metrics_aggregated.csv, time_aggregated.csv
+      - aggregated_avg_* / aggregated_std_* (per-agent avg/std), except the
+        two *_over_models.csv files
+    Everything else, including aggregated_time_*, is skipped.
+    """
+    skipped_names = {
+        # Deprecated / intermediate / artifact files
+        "aggregated_avg_over_models.csv",
+        "aggregated_std_over_models.csv",
+        # Baselines (hardcoded in baselines.json, not regenerated)
+        "final_baseline.csv",
+        "final_baseline_zeroshot.csv",
+        "contamination_baseline.csv",
+        "contamination_baseline_zeroshot.csv",
+    }
+    skipped_prefixes = (
+        # Intermediate time CSVs (only time_aggregated.csv is a final output)
+        "aggregated_time_",
+        # Deprecated / artifact variants of the per-method files
+        "final_avg_",
+        "final_std_",
+        "final_time_",
+    )
+    if filename in skipped_names or filename.startswith(skipped_prefixes):
+        return False
+
+    # Per-method final scores and contamination flags: the name must also
+    # end in ".csv", otherwise no rule accepts it.
+    if filename.startswith(("final_", "contamination_")):
+        return filename.endswith(".csv")
+
+    # Single metric outputs and time aggregation
+    final_outputs = {
+        "single_metrics.csv",
+        "single_metrics_aggregated.csv",
+        "time_aggregated.csv",
+    }
+    if filename in final_outputs:
+        return True
+
+    # Per-agent avg/std (multi-run agents); no extension check here
+    per_agent_prefixes = ("aggregated_avg_", "aggregated_std_")
+    return filename.startswith(per_agent_prefixes)
+
+
+def main():
+    """
+    Command-line entry point: compare ground-truth CSVs with new outputs.
+
+    Compares every verifiable CSV present in both directories and prints a
+    PASS/FAIL line per file, a summary, the files found on only one side, and
+    up to 10 mismatches per failing file.
+
+    Exits with status 1 if a directory is missing, nothing can be compared,
+    any file differs, or a verifiable ground-truth file is absent from the
+    new output (extra files there are only reported); otherwise exits 0.
+    """
+    cli = argparse.ArgumentParser(
+        description="Verify refactored aggregation outputs match ground truth."
+    )
+    cli.add_argument(
+        "--ground-truth",
+        required=True,
+        help="Directory with ground truth CSV outputs (from original scripts).",
+    )
+    cli.add_argument(
+        "--new-output",
+        required=True,
+        help="Directory with new CSV outputs (from refactored scripts).",
+    )
+    args = cli.parse_args()
+    gt_dir, new_dir = args.ground_truth, args.new_output
+
+    for label, directory in (("ground truth", gt_dir), ("new output", new_dir)):
+        if not os.path.isdir(directory):
+            print(f"Error: {label} dir not found: {directory}")
+            sys.exit(1)
+
+    matches = find_matching_files(gt_dir, new_dir)
+    if not matches:
+        print("No matching CSV files found to compare.")
+        sys.exit(1)
+
+    # Verifiable files that exist on only one side of the comparison.
+    gt_verifiable = {
+        f for f in os.listdir(gt_dir) if f.endswith(".csv") and should_verify(f)
+    }
+    new_verifiable = {
+        f for f in os.listdir(new_dir) if f.endswith(".csv") and should_verify(f)
+    }
+    missing_from_new = gt_verifiable - new_verifiable
+    extra_in_new = new_verifiable - gt_verifiable
+
+    total_files = len(matches)
+    print(f"Comparing {total_files} CSV files...\n")
+
+    failure_details = []
+    for filename, (gt_path, new_path) in sorted(matches.items()):
+        errors = compare_csvs(gt_path, new_path)
+        if errors:
+            failure_details.append((filename, errors))
+        print(f"  {'FAIL' if errors else 'PASS'}  {filename}")
+    failed = len(failure_details)
+    passed = total_files - failed
+
+    # Summary
+    print(f"\n{'='*60}")
+    print(f"Results: {passed} passed, {failed} failed, {total_files} total")
+
+    if missing_from_new:
+        print(f"\nMISSING from new output ({len(missing_from_new)}):")
+        for f in sorted(missing_from_new):
+            print(f"  - {f}")
+
+    if extra_in_new:
+        print(f"\nEXTRA in new output ({len(extra_in_new)}):")
+        for f in sorted(extra_in_new):
+            print(f"  + {f}")
+
+    if failure_details:
+        print("\nFailure details:")
+        for filename, errors in failure_details:
+            print(f"\n  {filename}:")
+            for err in errors[:10]:  # Cap at 10 errors per file
+                print(f"    {err}")
+            if len(errors) > 10:
+                print(f"    ... and {len(errors) - 10} more")
+
+    # A missing file fails the run; an extra file is only reported.
+    if failed or missing_from_new:
+        sys.exit(1)
+    print("\nAll checks passed.")
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/src/commit_utils/commit.sh b/src/commit_utils/commit.sh
index 34abd374..3c43144e 100644
--- a/src/commit_utils/commit.sh
+++ b/src/commit_utils/commit.sh
@@ -2,71 +2,83 @@
 source src/commit_utils/set_env_vars.sh
 
 models=(
-    "google/gemma-3-4b-pt"
+    # "google/gemma-3-4b-pt"
     "Qwen/Qwen3-4B-Base"
-    "Qwen/Qwen3-1.7B-Base"
-    "HuggingFaceTB/SmolLM3-3B-Base"
+    # "Qwen/Qwen3-1.7B-Base"
+    # "HuggingFaceTB/SmolLM3-3B-Base"
 )
 
 evals=(
-    "aime2025"
-    "arenahardwriting"
-    "bfcl"
-    "gpqamain"
-    "gsm8k"
-    "humaneval"
+    # "aime2025"
+    # "arenahardwriting"
+    # "bfcl"
+    # "gpqamain"
+    # "gsm8k"
+    # "humaneval"
     "healthbench"
 )
-# export POST_TRAIN_BENCH_EXPERIMENT_NAME="_pushed"
+export POST_TRAIN_BENCH_EXPERIMENT_NAME="_METR"
 for model in "${models[@]}"; do
     for eval in "${evals[@]}"; do
         echo ""
         echo $model on $eval
         if [ "${POST_TRAIN_BENCH_JOB_SCHEDULER}" = "htcondor_mpi-is" ]; then
             # Proprietary (API)
-            condor_submit_bid 100 -a "agent=codex" -a "agent_config=gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 50 -a "agent=codex" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 50 -a "agent=qwen3max" -a "agent_config=qwen3-max-2026-01-23" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex" -a "agent_config=gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 50 -a "agent=codex" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 50 -a "agent=qwen3max" -a "agent_config=qwen3-max-2026-01-23" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
             # Proprietary (Subscription plan)
-            condor_submit_bid 100 -a "agent=codex_non_api" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 150 -a "agent=claude_non_api" -a "agent_config=claude-sonnet-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.2" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=50" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex_non_api" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 150 -a "agent=claude_non_api" -a "agent_config=claude-sonnet-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.2" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=50" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-7" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex_non_api_xhigh" -a "agent_config=gpt-5.5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex_xhigh" -a "agent_config=gpt-5.5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            
 
             # Multi-GPU runs might need more than 8 CPUs and 128 GB of RAM (use 512 GB to be safe)
-            condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=50" -a "request_memory=524288" -a "request_cpus=128" src/commit_utils/single_task.sub   
-            condor_submit_bid 500 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=50" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=100" -a "request_memory=524288" -a "request_cpus=128" -a "request_disk=800G" src/commit_utils/single_task.sub   
+            condor_submit_bid 100 -a "agent=claude_reprompt" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=100" -a "request_memory=524288" -a "request_cpus=128" -a "request_disk=800G" src/commit_utils/single_task.sub   
+            # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=50" -a "request_memory=524288" -a "request_cpus=128" -a "request_disk=800G" src/commit_utils/single_task.sub   
+            # condor_submit_bid 500 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=50" src/commit_utils/single_task.sub
 
-            # Reprompted variant to push the agent (such as GPT 5.4)
-            condor_submit_bid 100 -a "agent=codex_non_api_high_reprompt" -a "agent_config=gpt-5.4" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # # Reprompted variant to push the agent (such as GPT 5.4)
+            # condor_submit_bid 50 -a "agent=codex_non_api_high_reprompt" -a "agent_config=gpt-5.4" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex_non_api_xhigh_reprompt" -a "agent_config=gpt-5.5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex_xhigh_reprompt" -a "agent_config=gpt-5.5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude_reprompt" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=1" -a "num_hours=5" src/commit_utils/single_task.sub   
 
-            condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.4" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=codex_non_api_xhigh" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=claude_non_api_max" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=1" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-flash-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 150 -a "agent=gemini" -a "agent_config=models/gemini-3.1-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+
+
+            # condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.4" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=codex_non_api_xhigh" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=claude_non_api_max" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=1" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-flash-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 150 -a "agent=gemini" -a "agent_config=models/gemini-3.1-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
+           
             # OpenCode 
-            condor_submit_bid 50 -a "agent=opencode" -a "agent_config=anthropic/claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub
-            condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/kimi-k2-thinking" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-4.7-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 500 -a "agent=opencode" -a "agent_config=opencode/gemini-3-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.1-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 50 -a "agent=glm5" -a "agent_config=glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.5-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 100 -a "agent=opencode" -a "agent_config=zai/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/kimi-k2.5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            condor_submit_bid 150 -a "agent=opencode" -a "agent_config=opencode/gemini-3.1-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
-            sleep 10
+            # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=anthropic/claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub
+            # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/kimi-k2-thinking" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-4.7-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 500 -a "agent=opencode" -a "agent_config=opencode/gemini-3-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.1-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 50 -a "agent=glm5" -a "agent_config=glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.5-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 100 -a "agent=opencode" -a "agent_config=zai/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/kimi-k2.5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # condor_submit_bid 150 -a "agent=opencode" -a "agent_config=opencode/gemini-3.1-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub 
+            # sleep 10
         elif [ "${POST_TRAIN_BENCH_JOB_SCHEDULER}" = "htcondor" ]; then
             condor_submit_bid -a "agent=codex" -a "agent_config=gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
             condor_submit_bid -a "agent=codex" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub
diff --git a/src/commit_utils/rerun_eval.sub b/src/commit_utils/rerun_eval.sub
new file mode 100644
index 00000000..62aa4a17
--- /dev/null
+++ b/src/commit_utils/rerun_eval.sub
@@ -0,0 +1,24 @@
+# HTCondor submit description: runs scripts/rerun_eval_n_times.sh through
+# /bin/bash as one job (a single bare `queue` statement at the end).
+# $(eval_dir) is not defined in this file, so it has to be supplied at submit
+# time; num_gpus and n are set here to 1 and 5 and feed the lines below.
+executable = /bin/bash
+num_gpus = 1
+n = 5
+arguments = scripts/rerun_eval_n_times.sh $(eval_dir) $(n)
+# Copy the listed variables from the submitting environment into the job.
+environment = "OPENAI_API_KEY=$ENV(OPENAI_API_KEY) HOME=$ENV(HOME) POST_TRAIN_BENCH_RESULTS_DIR=$ENV(POST_TRAIN_BENCH_RESULTS_DIR) POST_TRAIN_BENCH_CONTAINERS_DIR=$ENV(POST_TRAIN_BENCH_CONTAINERS_DIR) POST_TRAIN_BENCH_CONTAINER_NAME=$ENV(POST_TRAIN_BENCH_CONTAINER_NAME) POST_TRAIN_BENCH_JOB_SCHEDULER=$ENV(POST_TRAIN_BENCH_JOB_SCHEDULER) HF_HOME=$ENV(HF_HOME)"
+# stderr, stdout and the HTCondor event log are named after the cluster id.
+error = rerun_$(Cluster).err
+output = rerun_$(Cluster).out
+log = rerun_$(Cluster).log
+# Resource requests. NOTE(review): request_memory is presumably in MiB (HTCondor's default unit) - confirm.
+request_memory = 131072
+request_cpus = 16
+request_gpus = $(num_gpus)
+# Only match machines reporting an H100 80GB GPU, and never the one host excluded by name.
+requirements = TARGET.CUDADeviceName == "NVIDIA H100 80GB HBM3" && Machine != "i104.internal.cluster.is.localnet"
+request_disk=400G
+# Leading '+' adds a custom attribute to the job ad. NOTE(review): presumably a site-specific switch - confirm its effect.
++BypassLXCfs="true"
+queue