diff --git a/agents/claude_reprompt/human_readable_trace.py b/agents/claude_reprompt/human_readable_trace.py new file mode 120000 index 00000000..d643db01 --- /dev/null +++ b/agents/claude_reprompt/human_readable_trace.py @@ -0,0 +1 @@ +../claude/human_readable_trace.py \ No newline at end of file diff --git a/agents/claude_reprompt/solve.sh b/agents/claude_reprompt/solve.sh new file mode 100755 index 00000000..b0b25a4c --- /dev/null +++ b/agents/claude_reprompt/solve.sh @@ -0,0 +1,36 @@ +#!/bin/bash +unset GEMINI_API_KEY +unset CODEX_API_KEY + +export BASH_MAX_TIMEOUT_MS="36000000" + +MIN_REMAINING_MINUTES=30 + +claude --print --verbose --model "$AGENT_CONFIG" --output-format stream-json \ + --dangerously-skip-permissions "$PROMPT" + +# Re-prompt loop: if the agent finishes early, resume the session +while true; do + TIMER_OUTPUT=$(bash timer.sh 2>/dev/null) + if echo "$TIMER_OUTPUT" | grep -q "expired"; then + break + fi + + REMAINING_HOURS=$(echo "$TIMER_OUTPUT" | grep -oP '^\d+(?=:)' | head -n 1) + REMAINING_MINS=$(echo "$TIMER_OUTPUT" | grep -oP '(?<=:)\d+' | head -n 1) + # Stop if the timer output could not be parsed; otherwise the -lt test below errors out and the loop never exits + if [ -z "$REMAINING_HOURS" ] || [ -z "$REMAINING_MINS" ]; then + break + fi + # 10# forces base 10 so zero-padded fields such as "08" are not rejected as invalid octal + TOTAL_REMAINING_MINS=$(( 10#$REMAINING_HOURS * 60 + 10#$REMAINING_MINS )) + + if [ "$TOTAL_REMAINING_MINS" -lt "$MIN_REMAINING_MINUTES" ]; then + break + fi + + CONTINUATION_PROMPT="You still have ${REMAINING_HOURS}h ${REMAINING_MINS}m remaining. Please continue improving your result and maximize performance." 
+ + claude --print --verbose --continue --model "$AGENT_CONFIG" --output-format stream-json \ + --dangerously-skip-permissions "$CONTINUATION_PROMPT" +done diff --git a/agents/codex_xhigh/human_readable_trace.py b/agents/codex_xhigh/human_readable_trace.py new file mode 120000 index 00000000..9cf1a5d9 --- /dev/null +++ b/agents/codex_xhigh/human_readable_trace.py @@ -0,0 +1 @@ +../codex/human_readable_trace.py \ No newline at end of file diff --git a/agents/codex_xhigh/solve.sh b/agents/codex_xhigh/solve.sh new file mode 100755 index 00000000..443f1c5a --- /dev/null +++ b/agents/codex_xhigh/solve.sh @@ -0,0 +1,12 @@ +#!/bin/bash +unset ANTHROPIC_API_KEY +unset GEMINI_API_KEY + +# Set reasoning effort to xhigh (prepend to ensure precedence) +file=/home/ben/.codex/config.toml +tmp="$(mktemp)" +printf 'model_reasoning_effort = "xhigh"\n\n' > "$tmp" +[ -f "$file" ] && cat "$file" >> "$tmp" +mv "$tmp" "$file" + +codex --search exec --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$PROMPT" diff --git a/agents/codex_xhigh_reprompt/human_readable_trace.py b/agents/codex_xhigh_reprompt/human_readable_trace.py new file mode 120000 index 00000000..9cf1a5d9 --- /dev/null +++ b/agents/codex_xhigh_reprompt/human_readable_trace.py @@ -0,0 +1 @@ +../codex/human_readable_trace.py \ No newline at end of file diff --git a/agents/codex_xhigh_reprompt/solve.sh b/agents/codex_xhigh_reprompt/solve.sh new file mode 100755 index 00000000..3afc9730 --- /dev/null +++ b/agents/codex_xhigh_reprompt/solve.sh @@ -0,0 +1,39 @@ +#!/bin/bash +unset ANTHROPIC_API_KEY +unset GEMINI_API_KEY + +# Set reasoning effort to xhigh (prepend to ensure precedence) +file=/home/ben/.codex/config.toml +tmp="$(mktemp)" +printf 'model_reasoning_effort = "xhigh"\n\n' > "$tmp" +[ -f "$file" ] && cat "$file" >> "$tmp" +mv "$tmp" "$file" + +MIN_REMAINING_MINUTES=30 + +codex --search exec --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model 
"$AGENT_CONFIG" "$PROMPT" + +# Re-prompt loop: if the agent finishes early, resume the session +while true; do + TIMER_OUTPUT=$(bash timer.sh 2>/dev/null) + if echo "$TIMER_OUTPUT" | grep -q "expired"; then + break + fi + + REMAINING_HOURS=$(echo "$TIMER_OUTPUT" | grep -oP '^\d+(?=:)' | head -n 1) + REMAINING_MINS=$(echo "$TIMER_OUTPUT" | grep -oP '(?<=:)\d+' | head -n 1) + # Stop if the timer output could not be parsed; otherwise the -lt test below errors out and the loop never exits + if [ -z "$REMAINING_HOURS" ] || [ -z "$REMAINING_MINS" ]; then + break + fi + # 10# forces base 10 so zero-padded fields such as "08" are not rejected as invalid octal + TOTAL_REMAINING_MINS=$(( 10#$REMAINING_HOURS * 60 + 10#$REMAINING_MINS )) + + if [ "$TOTAL_REMAINING_MINS" -lt "$MIN_REMAINING_MINUTES" ]; then + break + fi + + CONTINUATION_PROMPT="You still have ${REMAINING_HOURS}h ${REMAINING_MINS}m remaining. Please continue improving your result and maximize performance." + + codex --search exec resume --last --json -c model_reasoning_summary=detailed --skip-git-repo-check --yolo --model "$AGENT_CONFIG" "$CONTINUATION_PROMPT" +done diff --git a/containers/gpt_5_5.def b/containers/gpt_5_5.def new file mode 100644 index 00000000..50105446 --- /dev/null +++ b/containers/gpt_5_5.def @@ -0,0 +1,78 @@ +Bootstrap: docker +From: nvidia/cuda:12.9.1-cudnn-devel-ubuntu22.04 + +%files + containers/requirements-direct.txt /opt/requirements-direct.txt + +%post + chmod 1777 /tmp + # Set environment variables + export DEBIAN_FRONTEND=noninteractive + + # Update and install system dependencies + apt-get update && apt-get install -y \ + python3.10 \ + python3-dev \ + git \ + wget \ + curl \ + build-essential \ + && rm -rf /var/lib/apt/lists/* + + # Create python3 symlink + ln -sf /usr/bin/python3.10 /usr/bin/python3 + ln -sf /usr/bin/python3.10 /usr/bin/python + + # Install Node.js (LTS version 22.x) for npm + curl -fsSL https://deb.nodesource.com/setup_22.x | bash - + apt-get install -y nodejs + + # Install uv + 
curl -LsSf https://astral.sh/uv/install.sh | sh + export PATH="/root/.local/bin:$PATH" + + uv pip install --system --no-cache vllm==0.11.0 --torch-backend=auto + + # Pinned direct dependencies + uv pip install --system --no-cache -r /opt/requirements-direct.txt + + # flash-attn (needs 
no-build-isolation) + uv pip install --system --no-cache flash-attn==2.8.3 --no-build-isolation + + # update CLI harnesses to the latest stable versions + # OpenCode doesn't support DeepSeek V4 yet. + npm install -g \ + @anthropic-ai/claude-code@2.1.116 \ + @openai/codex@0.124.0 \ + @google/gemini-cli@0.39.1 \ + opencode-ai@1.14.20 + + # install inspect evals + mkdir -p /opt + cd /opt + git clone https://github.com/UKGovernmentBEIS/inspect_evals.git + cd /opt/inspect_evals + git checkout 06001a83e6d7c709c2ede0570dce7f1031a0bad8 + uv pip install --system --no-cache . + + # install inspect ai with debug + mkdir -p /opt + cd /opt + git clone https://github.com/rank-and-file/inspect_ai_vllm_stdout.git + cd inspect_ai_vllm_stdout + uv pip install --system --no-cache . + +%environment + export PATH="/root/.local/bin:$PATH" + export NO_PROXY="localhost,127.0.0.1" + export no_proxy="localhost,127.0.0.1" + +%runscript + exec python3 "$@" + +%labels + Version v1.0 + Description Python ML container with CUDA support for transformers and LLM training (using uv) + AI CLI tools + +%help + Note: Use the --nv flag to enable NVIDIA GPU support when running the container. 
diff --git a/dev_utils/extract_traces.py b/dev_utils/extract_traces.py index 74e2e7c1..ae4affec 100644 --- a/dev_utils/extract_traces.py +++ b/dev_utils/extract_traces.py @@ -153,6 +153,11 @@ def main(): nargs="+", help="Input directory names (relative to RESULTS_BASE) to process" ) + parser.add_argument( + "--all", + action="store_true", + help="Copy all runs, not just the latest per task (default: latest only)" + ) args = parser.parse_args() output_base = Path(OUTPUT_DIR) @@ -175,8 +180,12 @@ def main(): print(f"\n[{input_dir_name}]") - # Iterate over only the latest subdirectories (highest ID per prefix) - for subdir in sorted(get_latest_subdirs(input_dir)): + # Iterate over subdirectories (latest per task by default, all with --all) + if args.all: + subdirs = sorted(d for d in input_dir.iterdir() if d.is_dir()) + else: + subdirs = sorted(get_latest_subdirs(input_dir)) + for subdir in subdirs: # Determine source file (prefer solve_parsed.txt) src_file = subdir / "solve_parsed.txt" if not src_file.exists(): @@ -201,6 +210,7 @@ def main(): copy_other_files(subdir, dest_dir, 'contamination_judgement.txt', api_keys=api_keys) copy_other_files(subdir, dest_dir, 'disallowed_model_judgement.txt', api_keys=api_keys) copy_other_files(subdir, dest_dir, 'error.log', 'judgement.log', api_keys=api_keys) + copy_other_files(subdir, dest_dir, 'time_taken.txt', api_keys=api_keys) copy_other_files(subdir, dest_dir, 'system_monitor.log', api_keys=api_keys, optional=True) tag = " [sanitized]" if was_sanitized else "" diff --git a/dev_utils/limit_hit_list.py b/dev_utils/limit_hit_list.py index 12f58093..9bfa144d 100644 --- a/dev_utils/limit_hit_list.py +++ b/dev_utils/limit_hit_list.py @@ -10,11 +10,13 @@ "You've hit your limit", # Claude Code Pro subscription limit "spending_limit", # Anthropic/OpenAI spending limit "billing_hard_limit", # OpenAI billing hard limit - "insufficient_quota", # OpenAI quota exceeded + "insufficient_quota", # OpenAI quota exceeded (structured error code) 
+ "Quota exceeded. Check your plan", # OpenAI/Codex quota exceeded (turn.failed message) "budget_exceeded", # General budget error "plan does not yet include", # Z.AI subscription plan restriction "token_expired", # OpenAI/Codex expired auth token "Failed to refresh token", # Codex CLI refresh token failure + "Reconnecting... 5/5", # Codex CLI exhausted stream-reconnect retries ] diff --git a/dev_utils/terminated_finder.py b/dev_utils/terminated_finder.py index f7af3781..90e21feb 100644 --- a/dev_utils/terminated_finder.py +++ b/dev_utils/terminated_finder.py @@ -11,19 +11,23 @@ def get_results_dir(): return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results") +KILLED_RE = re.compile(rb"run_task\.sh: line \d+: \d+ Killed") + + def classify_error(error_log_path: Path) -> str | None: """Classify the error in error.log. Returns 'terminated', 'killed', or None.""" if not error_log_path.exists(): return None try: - content = error_log_path.read_text() - if content.startswith("Terminated"): - return "terminated" - if re.search(r"\bKilled\b", content): - return "killed" - return None + with open(error_log_path, "rb") as f: + head = f.read(4096) except Exception: return None + if head.startswith(b"Terminated"): + return "terminated" + if KILLED_RE.search(head): + return "killed" + return None def get_latest_runs(method_path: Path): diff --git a/scripts/README.md b/scripts/README.md new file mode 100644 index 00000000..b6197f59 --- /dev/null +++ b/scripts/README.md @@ -0,0 +1,149 @@ +# scripts + +Post-hoc analysis utilities for PostTrainBench result directories. Most scripts +here read the contents of `$POST_TRAIN_BENCH_RESULTS_DIR` and produce CSV / +JSON aggregates; the exception is `rerun_eval_n_times.sh`, which actually +re-runs the model on a GPU. + +## Aggregating results into CSVs + +The pipeline is two scripts: `collect.py` reads raw run dirs into per-method +CSVs, then `aggregate.py` rolls those into per-agent avg/std and the weighted +leaderboard metric. 
+ +### Typical flow + +From the repo root, with `POST_TRAIN_BENCH_RESULTS_DIR` pointing at the raw +results tree: + +```bash +# 1. Collect raw per-run data into per-method CSVs. +# Reads metrics.json + contamination/disallowed_model judgements + time_taken.txt, +# applies baseline-zeroshot fallback for contaminated/errored cells. +# Writes: +# final_{method}.csv — score grid (model x benchmark) with fallback +# contamination_{method}.csv — flags ("", "C", "M", "MC", or error string) +# time_overview.csv — average wall time per method +python scripts/collect.py + +# 2. Aggregate across runs/agents and compute the weighted leaderboard metric. +# Reads final_{method}.csv produced above. Writes: +# aggregated_avg_{agent}.csv — per-cell mean across runs (one per multi-run agent) +# aggregated_std_{agent}.csv — per-cell sample stddev (n-1) +# single_metrics.csv — weighted score per individual run +# single_metrics_aggregated.csv — agent-level avg/std/n on the weighted metric +# time_aggregated.csv — agent-level avg/std wall time +python scripts/aggregate.py +``` + +`aggregate.py` skips agents whose run CSVs aren't present in this results +dir, so it's safe to run against a partial tree. + +### `collect.py` flags + +```bash +python scripts/collect.py \ + --data-dir /path/to/results \ # default: $POST_TRAIN_BENCH_RESULTS_DIR + --output-dir /path/to/out \ # default: same as --data-dir + --min-run-id 17000000 \ # inclusive lower bound on cluster_id + --max-run-id 17200000 # exclusive upper bound on cluster_id +``` + +### `aggregate.py` flags + +By default `--all` is implied (write everything). Use the flags below to +restrict to one stage: + +```bash +python scripts/aggregate.py --per-cell # only aggregated_avg/std_{agent}.csv +python scripts/aggregate.py --leaderboard # only single_metrics{,_aggregated}.csv +python scripts/aggregate.py --time # only time_aggregated.csv +``` + +Same `--data-dir` / `--output-dir` flags as `collect.py`. 
+ +### Hardcoded things + +| File | What it pins | +|---|---| +| `constants.py` (`HARDCODED_AGENT_MAP`) | Which run directories belong to which agent (multi-run agents are how stddev is computed) | +| `constants.py` (`HARDCODED_BENCHMARKS`, `EXPECTED_MODELS`) | Benchmark + base-model lists | +| `factors.json` | Per-benchmark weights for the weighted leaderboard metric | +| `baselines.json` | Hardcoded zero-shot + few-shot baseline scores; used as fallback for contaminated/errored cells (no longer recomputed at every run) | + +To add a new agent: add its run-dir names to `HARDCODED_AGENT_MAP` in +`constants.py`. To add a new benchmark: extend `HARDCODED_BENCHMARKS` and add +a weight to `factors.json`. + +### `verify.py` (refactor regression check) + +`verify.py` is a one-off script used when the new pipeline was +rolled out. It compares two CSV output dirs cell-by-cell with float +tolerance to confirm that the new pipeline reproduces the old one's values +(except for filename renames). Not part of the normal workflow. + +```bash +python scripts/verify.py \ + --ground-truth /fast/.../ptb_results_old \ + --new-output /fast/.../ptb_results_new +``` + +## Other helpers + +| Script | Description | +|---|---| +| `compute_claude_costs.py` | Claude API spend rollup | +| `extract_token_usage.py` | Token-usage extraction from agent traces | +| `migrate_judgement_files.py` | One-off: migrate older judgement file naming | +| `list_safetensors.py` | List safetensors files under a result tree | +| `parse_all_to_human_readable.sh` | Run human-readable trace parsers across results | +| `baselines.json`, `factors.json`, `constants.py`, `utils.py` | Shared config / helpers | + +## Re-evaluating a finished run N times + +`rerun_eval_n_times.sh` re-evaluates a job's `final_model/` N times and writes +mean / std / stderr / min / max per metric into `metrics_averaged.json`. 
Useful +because each job's standard `metrics.json` is a single decoding sample per +question and does not capture decoding noise. + +It mirrors `src/run_task.sh`'s evaluation step exactly: + +- runs `src/eval/tasks/<task>/evaluate.py` (the live source — **not** the + potentially-modified snapshot in `<EVAL_DIR>/task/`) +- inside the same `${POST_TRAIN_BENCH_CONTAINER_NAME}.sif` container +- with the same fuse-overlayfs HF cache pattern (`with_huggingface_overlay`) +- using the same `--max-tokens` fallback ladder per task + +Per-run JSONs are written to `<EVAL_DIR>/reruns/run_{i}.json` (with +`run_{i}_{level}.log` alongside). The aggregated file is `<EVAL_DIR>/metrics_averaged.json`. + +### Files + +| File | Description | +|---|---| +| `rerun_eval_n_times.sh` | Driver: re-runs `evaluate.py` N times on one EVAL_DIR and aggregates | +| `aggregate_metrics_runs.py` | Helper called by the driver: computes mean/std/stderr/min/max from per-run JSONs | +| `../src/commit_utils/rerun_eval.sub` | HTCondor submission file | + +### Usage + +#### Locally on a GPU node + +From the repo root: + +```bash +scripts/rerun_eval_n_times.sh /path/to/EVAL_DIR 5 +``` + +`EVAL_DIR` must be an existing job directory containing `final_model/`. The +task name is parsed from the basename (`<task>_<model>_<cluster_id>`) to +pick the correct max-tokens fallback ladder. + +#### HTCondor + +```bash +condor_submit_bid 50 \ + -a "eval_dir=/path/to/EVAL_DIR" \ + -a "n=5" \ + src/commit_utils/rerun_eval.sub +``` diff --git a/scripts/aggregate.py b/scripts/aggregate.py new file mode 100644 index 00000000..4ecb2457 --- /dev/null +++ b/scripts/aggregate.py @@ -0,0 +1,347 @@ +#!/usr/bin/env python3 +""" +Aggregate results across multiple runs per agent. 
+ +Reads final_{method}.csv files produced by collect.py and computes: + --per-cell : aggregated_avg_{agent}.csv, aggregated_std_{agent}.csv + --leaderboard : single_metrics.csv, single_metrics_aggregated.csv + --time : time_aggregated.csv + --all : everything (default) + +Usage: + python aggregate.py + python aggregate.py --data-dir /path/to/results --output-dir /path/to/output + python aggregate.py --per-cell --leaderboard +""" +import argparse +import csv +import os +import re + +from utils import ( + get_results_dir, + load_csv_as_dict, + write_csv, + load_factors, + mean, + stddev, + is_number, + format_time_hms, + HARDCODED_AGENT_MAP, + HARDCODED_BENCHMARKS, + EXPECTED_MODELS, +) + + +# --------------------------------------------------------------------------- +# Per-cell avg/std across runs +# --------------------------------------------------------------------------- + +def aggregate_per_cell( + agent_name: str, + method_names: list[str], + data_dir: str, + output_dir: str, +): + """ + For each (model, benchmark) cell, compute mean and sample stddev + across the runs. Write aggregated_avg_{agent}.csv and aggregated_std_{agent}.csv. 
+ """ + all_data = [] + all_models = None + + for method_name in method_names: + csv_path = os.path.join(data_dir, f"final_{method_name}.csv") + data, _ = load_csv_as_dict(csv_path) + + models = sorted(data.keys()) + if all_models is None: + all_models = models + else: + assert all_models == models, ( + f"Model mismatch for {method_name}: " + f"expected {all_models}, got {models}" + ) + all_data.append(data) + + avg_data = {} + std_data = {} + + for model in all_models: + avg_data[model] = {} + std_data[model] = {} + + for bench in HARDCODED_BENCHMARKS: + values = [] + for data in all_data: + values.append(float(data[model][bench])) + + avg_data[model][bench] = str(mean(values)) + std_data[model][bench] = str(stddev(values)) + + avg_path = os.path.join(output_dir, f"aggregated_avg_{agent_name}.csv") + write_csv(avg_path, all_models, HARDCODED_BENCHMARKS, avg_data) + print(f"Written: {avg_path}") + + std_path = os.path.join(output_dir, f"aggregated_std_{agent_name}.csv") + write_csv(std_path, all_models, HARDCODED_BENCHMARKS, std_data) + print(f"Written: {std_path}") + + return avg_data, std_data + + +# --------------------------------------------------------------------------- +# Weighted single metric +# --------------------------------------------------------------------------- + +def compute_weighted_metric( + data: dict[str, dict[str, str]], + factors: dict[str, float], +) -> float: + """ + Compute weighted sum: for each benchmark, average across models, + multiply by factor, sum. + """ + valid_benchmarks = set(factors.keys()) + total = 0.0 + num_models = len(data) + for bench in sorted(valid_benchmarks): + values = [float(data[model][bench]) for model in data] + avg_value = sum(values) / num_models + total += avg_value * factors[bench] + return total + + +def aggregate_leaderboard(data_dir: str, output_dir: str): + """ + Compute weighted metric for every final_*.csv that has all expected models. + Then group by HARDCODED_AGENT_MAP for avg/std. 
+ + Also writes final_avg_{agent}.csv and final_std_{agent}.csv (identical to + aggregated_ versions) so their metrics appear in single_metrics.csv. + """ + factors = load_factors() + valid_benchmarks = set(factors.keys()) + + # Phase 1: compute per-cell avg/std and write final_avg/std files + # so they get picked up in the metric scan below + for agent_name, method_names in HARDCODED_AGENT_MAP.items(): + avg_data, std_data = _load_avg_std_for_agent( + agent_name, method_names, data_dir + ) + if avg_data is not None: + # Write final_avg_{agent}.csv (identical to aggregated_avg_) + avg_path = os.path.join(output_dir, f"final_avg_{agent_name}.csv") + write_csv( + avg_path, + sorted(avg_data.keys()), + HARDCODED_BENCHMARKS, + avg_data, + ) + std_path = os.path.join(output_dir, f"final_std_{agent_name}.csv") + write_csv( + std_path, + sorted(std_data.keys()), + HARDCODED_BENCHMARKS, + std_data, + ) + + # Phase 2: compute metrics for ALL final_*.csv files in the output dir + all_metrics = {} + + for filename in os.listdir(output_dir): + if not filename.startswith("final_"): + continue + if not filename.endswith(".csv"): + continue + if filename.startswith("final_time_"): + continue + + csv_path = os.path.join(output_dir, filename) + try: + data, _ = load_csv_as_dict(csv_path) + except Exception: + print(f"Warning: could not load {csv_path}.") + raise + + if set(data.keys()) != EXPECTED_MODELS: + continue + + method_name = filename[len("final_"):-len(".csv")] + all_metrics[method_name] = compute_weighted_metric(data, factors) + + # Write individual metrics + metrics_path = os.path.join(output_dir, "single_metrics.csv") + with open(metrics_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["method", "metric"]) + for method_name in sorted(all_metrics.keys()): + writer.writerow([method_name, all_metrics[method_name]]) + print(f"Written: {metrics_path}") + + # Compute aggregated metrics per agent group + aggregated_path = os.path.join(output_dir, 
"single_metrics_aggregated.csv") + with open(aggregated_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["agent", "avg", "std", "n"]) + for agent_name in sorted(HARDCODED_AGENT_MAP.keys()): + method_names = HARDCODED_AGENT_MAP[agent_name] + # Skip agents with missing runs + if not all(m in all_metrics for m in method_names): + print(f"Skipping agent {agent_name} in leaderboard: missing metrics") + continue + metrics = [all_metrics[m] for m in method_names] + writer.writerow([ + agent_name, + mean(metrics), + stddev(metrics), + len(metrics), + ]) + print(f"Written: {aggregated_path}") + + +def _load_avg_std_for_agent( + agent_name: str, + method_names: list[str], + data_dir: str, +) -> tuple[dict | None, dict | None]: + """Load final_*.csv for each run and compute per-cell avg/std.""" + all_data = [] + all_models = None + + for method_name in method_names: + csv_path = os.path.join(data_dir, f"final_{method_name}.csv") + if not os.path.exists(csv_path): + return None, None + data, _ = load_csv_as_dict(csv_path) + models = sorted(data.keys()) + if all_models is None: + all_models = models + all_data.append(data) + + avg_data = {} + std_data = {} + for model in all_models: + avg_data[model] = {} + std_data[model] = {} + for bench in HARDCODED_BENCHMARKS: + values = [float(d[model][bench]) for d in all_data] + avg_data[model][bench] = str(mean(values)) + std_data[model][bench] = str(stddev(values)) + + return avg_data, std_data + + +# --------------------------------------------------------------------------- +# Time aggregation +# --------------------------------------------------------------------------- + +def parse_time_to_hours(time_str: str) -> float: + """Parse time string like '8:17:28' to hours as float.""" + parts = time_str.split(":") + hours = int(parts[0]) + minutes = int(parts[1]) + seconds = int(parts[2]) + return hours + minutes / 60 + seconds / 3600 + + +def aggregate_time(data_dir: str, output_dir: str): + """ + Read 
time_overview.csv, group by HARDCODED_AGENT_MAP, compute avg/std. + Write time_aggregated.csv. + """ + time_csv_path = os.path.join(data_dir, "time_overview.csv") + + time_data = {} + with open(time_csv_path, "r", newline="") as f: + reader = csv.DictReader(f) + for row in reader: + method = row["method"] + avg_time = row["average_time"] + if avg_time and avg_time != "N/A": + time_data[method] = parse_time_to_hours(avg_time) + + output_path = os.path.join(output_dir, "time_aggregated.csv") + with open(output_path, "w", newline="") as f: + writer = csv.writer(f) + writer.writerow(["agent", "avg_time", "std_time", "n"]) + for agent_name, method_names in HARDCODED_AGENT_MAP.items(): + if not all(m in time_data for m in method_names): + print(f"Skipping agent {agent_name} in time: missing data") + continue + hours_list = [time_data[m] for m in method_names] + writer.writerow([ + agent_name, + format_time_hms(int(mean(hours_list) * 3600)), + format_time_hms(int(stddev(hours_list) * 3600)), + len(hours_list), + ]) + print(f"Written: {output_path}") + + +def _all_finals_exist(method_names: list[str], data_dir: str) -> bool: + """Check if all final_*.csv files exist for the given methods.""" + return all( + os.path.exists(os.path.join(data_dir, f"final_{m}.csv")) + for m in method_names + ) + + +# --------------------------------------------------------------------------- +# CLI +# --------------------------------------------------------------------------- + +def parse_args(): + parser = argparse.ArgumentParser( + description="Aggregate results across multiple runs per agent." + ) + parser.add_argument( + "--data-dir", + default=None, + help="Directory containing final_*.csv files (from collect.py). " + "Defaults to POST_TRAIN_BENCH_RESULTS_DIR or 'results'.", + ) + parser.add_argument( + "--output-dir", + default=None, + help="Directory to write output CSVs. 
Defaults to same as --data-dir.", + ) + parser.add_argument("--per-cell", action="store_true", + help="Write per-cell avg/std CSVs per agent.") + parser.add_argument("--leaderboard", action="store_true", + help="Write single_metrics.csv and single_metrics_aggregated.csv.") + parser.add_argument("--time", action="store_true", + help="Write time_aggregated.csv.") + parser.add_argument("--all", action="store_true", + help="Write everything (default if no flags given).") + return parser.parse_args() + + +def main(): + args = parse_args() + + data_dir = args.data_dir or get_results_dir() + output_dir = args.output_dir or data_dir + + os.makedirs(output_dir, exist_ok=True) + + do_all = args.all or not (args.per_cell or args.leaderboard or args.time) + + if do_all or args.per_cell: + for agent_name, method_names in HARDCODED_AGENT_MAP.items(): + # Skip agents whose run data isn't available + if not _all_finals_exist(method_names, data_dir): + print(f"Skipping agent {agent_name}: missing final CSVs") + continue + print(f"Processing agent: {agent_name}") + aggregate_per_cell(agent_name, method_names, data_dir, output_dir) + + if do_all or args.leaderboard: + aggregate_leaderboard(data_dir, output_dir) + + if do_all or args.time: + aggregate_time(data_dir, output_dir) + + +if __name__ == "__main__": + main() diff --git a/scripts/aggregate.sh b/scripts/aggregate.sh deleted file mode 100644 index a3a41e18..00000000 --- a/scripts/aggregate.sh +++ /dev/null @@ -1,54 +0,0 @@ -#!/bin/bash -source src/commit_utils/set_env_vars.sh - -echo "===============================" -echo "Aggregating method results..." -python scripts/aggregate_methods.py -echo "===============================" -echo "Aggregating time results..." -python scripts/aggregate_time_baselines.py -echo "===============================" -echo "Aggregating contamination results..." 
-python scripts/aggregate_contamination.py - -python scripts/aggregate_time.py -sleep 1 -python scripts/aggregate_final.py -sleep 1 -python scripts/aggregate_summary.py \ - claude_claude-opus-4-6_10h_run1_old_container \ - claude_claude-opus-4-6_10h_run2 \ - claude_claude-opus-4-6_10h_run3 \ - codex_non_api_gpt-5.3-codex_10h_run1 \ - codex_non_api_gpt-5.3-codex_10h_run2 \ - codex_non_api_gpt-5.3-codex_10h_run3 \ - opencode_opencode_glm-5_10h_run2 \ - opencode_opencode_kimi-k2.5_10h_run2 \ - opencode_opencode_minimax-m2.5-free_10h_run2 \ - opencode_zai_glm-5_10h_run2 \ - codex_non_api_high_gpt-5.3-codex_10h_run1 \ - codex_non_api_high_gpt-5.3-codex_10h_run2 \ - codex_non_api_high_gpt-5.3-codex_10h_run3 \ - codex_non_api_high_gpt-5.4_10h_run1 \ - codex_non_api_high_gpt-5.4_10h_run2 \ - codex_non_api_high_gpt-5.4_10h_run3 \ - claude_non_api_claude-opus-4-6_1m__10h_run1 \ - claude_non_api_claude-opus-4-6_1m__10h_run2 \ - claude_non_api_claude-opus-4-6_1m__10h_run3 - # opencode_anthropic_claude-opus-4-5_10h \ - # opencode_opencode_big-pickle_10h \ - # opencode_opencode_gemini-3-pro_10h \ - # opencode_opencode_glm-4.7-free_10h \ - # opencode_opencode_gpt-5.1-codex-max_10h \ - # opencode_opencode_kimi-k2-thinking_10h \ - # opencode_opencode_minimax-m2.1-free_10h \ - # qwen3max_qwen3-max-2026-01-23_10h - -# python scripts/aggregate_together.py \ -# opencode_anthropic_claude-opus-4-5_10h \ -# opencode_opencode_big-pickle_10h \ -# opencode_opencode_gemini-3-pro_10h \ -# opencode_opencode_glm-4.7-free_10h \ -# opencode_opencode_gpt-5.1-codex-max_10h \ -# opencode_opencode_kimi-k2-thinking_10h \ -# opencode_opencode_minimax-m2.1-free_10h \ No newline at end of file diff --git a/scripts/aggregate_avg_stddev.py b/scripts/aggregate_avg_stddev.py deleted file mode 100755 index b962c792..00000000 --- a/scripts/aggregate_avg_stddev.py +++ /dev/null @@ -1,131 +0,0 @@ -#!/usr/bin/env python3 -""" -Aggregate results across multiple runs for each agent. 
- -Takes the outputs of aggregate_final.py (final_*.csv files) and combines -them into average and standard deviation CSVs for each agent group. -""" -import os -import csv -import math - -from constants import HARDCODED_AGENT_MAP, HARDCODED_BENCHMARKS - -def get_results_dir(): - return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results") - - -def mean(values: list[float]) -> float: - return sum(values) / len(values) - - -def stddev(values: list[float]) -> float: - avg = mean(values) - variance = sum((x - avg) ** 2 for x in values) / (len(values) - 1) - return math.sqrt(variance) - - -def load_csv_as_dict(csv_path: str) -> tuple[dict[str, dict[str, str]], list[str]]: - """ - Load a CSV file into a dict of dicts: {model: {benchmark: value}}. - Returns (data_dict, list_of_benchmarks). - """ - data = {} - benchmarks = [] - - with open(csv_path, "r", newline="") as f: - reader = csv.reader(f) - header = next(reader) - - # First column is "model", rest are benchmarks - benchmarks = header[1:] - - for row in reader: - model = row[0] - data[model] = {} - for i, bench in enumerate(benchmarks): - data[model][bench] = row[i + 1] - - return data, benchmarks - - -def aggregate_runs(agent_name: str, method_names: list[str], results_dir: str): - """ - Aggregate results from multiple method runs into average and std CSV files. 
- """ - # Load all method data - all_data = [] - all_models = None - - for method_name in method_names: - csv_path = os.path.join(results_dir, f"final_{method_name}.csv") - data, _ = load_csv_as_dict(csv_path) - - models = sorted(data.keys()) - if all_models is None: - all_models = models - else: - assert all_models == models, ( - f"Model mismatch for {method_name}: " - f"expected {all_models}, got {models}" - ) - - all_data.append(data) - - # Compute average and std for each (model, benchmark) cell - avg_data = {} - std_data = {} - - for model in all_models: - avg_data[model] = {} - std_data[model] = {} - - for bench in HARDCODED_BENCHMARKS: - values = [] - for data in all_data: - value_str = data[model][bench] - value = float(value_str) - values.append(value) - - avg_data[model][bench] = str(mean(values)) - std_data[model][bench] = str(stddev(values)) - - # Write average CSV - avg_path = os.path.join(results_dir, f"aggregated_avg_{agent_name}.csv") - write_csv(avg_path, all_models, HARDCODED_BENCHMARKS, avg_data) - print(f"Written: {avg_path}") - - # Write std CSV - std_path = os.path.join(results_dir, f"aggregated_std_{agent_name}.csv") - write_csv(std_path, all_models, HARDCODED_BENCHMARKS, std_data) - print(f"Written: {std_path}") - - -def write_csv( - path: str, - models: list[str], - benchmarks: list[str], - data: dict[str, dict[str, str]], -): - """Write data dict to CSV file.""" - with open(path, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(["model"] + benchmarks) - - for model in models: - row = [model] - for bench in benchmarks: - row.append(data[model][bench]) - writer.writerow(row) - - -def main(): - results_dir = get_results_dir() - - for agent_name, method_names in HARDCODED_AGENT_MAP.items(): - print(f"Processing agent: {agent_name}") - aggregate_runs(agent_name, method_names, results_dir) - - -if __name__ == "__main__": - main() diff --git a/scripts/aggregate_avg_stddev_over_benchmarks.py 
def load_csv_as_dict(csv_path: str) -> dict[str, dict[str, str]]:
    """
    Load a CSV file into a dict of dicts: {model: {benchmark: value}}.

    The first header column names the model column; every remaining header
    column is a benchmark. An empty file yields an empty dict, blank rows are
    skipped, and cells missing from a short row are filled with "".
    Cells beyond the header width are ignored.
    """
    data: dict[str, dict[str, str]] = {}

    with open(csv_path, "r", newline="") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if not header:
            return data

        # First column is "model", rest are benchmarks
        benchmarks = header[1:]

        for row in reader:
            if not row:
                # csv.reader yields [] for a blank line
                continue
            cells = row[1:]
            # Pad short rows so every benchmark key is present.
            cells.extend([""] * (len(benchmarks) - len(cells)))
            data[row[0]] = dict(zip(benchmarks, cells))

    return data
- Returns (avg_per_benchmark, std_per_benchmark, avg_per_model_benchmark, std_per_model_benchmark, models). - - avg_per_benchmark[bench] = avg value (averaged over models) - avg_per_model_benchmark[model][bench] = avg value (per model) - """ - # For each run, compute model-averaged value per benchmark - # run_averages[benchmark] = [avg_run1, avg_run2, ...] - run_averages = {bench: [] for bench in HARDCODED_BENCHMARKS} - - # For each run, also store per-model values - # run_values_per_model[model][benchmark] = [val_run1, val_run2, ...] - run_values_per_model = {} - all_models = None - - for method_name in method_names: - csv_path = os.path.join(results_dir, f"final_{method_name}.csv") - data = load_csv_as_dict(csv_path) - - models = sorted(data.keys()) - if all_models is None: - all_models = models - for model in models: - run_values_per_model[model] = {bench: [] for bench in HARDCODED_BENCHMARKS} - - for bench in HARDCODED_BENCHMARKS: - model_avg = compute_model_average(data, bench) - run_averages[bench].append(model_avg) - - for model in all_models: - value = float(data[model][bench]) - run_values_per_model[model][bench].append(value) - - # Compute avg and std across runs for each benchmark (averaged over models) - avg_per_benchmark = {} - std_per_benchmark = {} - - for bench in HARDCODED_BENCHMARKS: - values = run_averages[bench] - avg_per_benchmark[bench] = mean(values) - std_per_benchmark[bench] = stddev(values) - - # Compute avg and std across runs for each (model, benchmark) pair - avg_per_model_benchmark = {} - std_per_model_benchmark = {} - - for model in all_models: - avg_per_model_benchmark[model] = {} - std_per_model_benchmark[model] = {} - for bench in HARDCODED_BENCHMARKS: - values = run_values_per_model[model][bench] - avg_per_model_benchmark[model][bench] = mean(values) - std_per_model_benchmark[model][bench] = stddev(values) - - return avg_per_benchmark, std_per_benchmark, avg_per_model_benchmark, std_per_model_benchmark, all_models - - -def main(): 
- results_dir = get_results_dir() - - # Collect results for all agents - # all_avg[benchmark][agent] = avg_value - # all_std[benchmark][agent] = std_value - all_avg = {bench: {} for bench in HARDCODED_BENCHMARKS} - all_std = {bench: {} for bench in HARDCODED_BENCHMARKS} - - # Per-model results: all_avg_per_model[model][benchmark][agent] = avg_value - all_avg_per_model = {} - all_std_per_model = {} - - agent_names = list(HARDCODED_AGENT_MAP.keys()) - all_models = None - - for agent_name, method_names in HARDCODED_AGENT_MAP.items(): - print(f"Processing agent: {agent_name}") - avg_per_benchmark, std_per_benchmark, avg_per_model, std_per_model, models = aggregate_agent(method_names, results_dir) - - if all_models is None: - all_models = models - for model in models: - all_avg_per_model[model] = {bench: {} for bench in HARDCODED_BENCHMARKS} - all_std_per_model[model] = {bench: {} for bench in HARDCODED_BENCHMARKS} - - for bench in HARDCODED_BENCHMARKS: - all_avg[bench][agent_name] = avg_per_benchmark[bench] - all_std[bench][agent_name] = std_per_benchmark[bench] - - for model in all_models: - all_avg_per_model[model][bench][agent_name] = avg_per_model[model][bench] - all_std_per_model[model][bench][agent_name] = std_per_model[model][bench] - - # Write average CSV (over models) - avg_path = os.path.join(results_dir, "aggregated_avg_over_models.csv") - with open(avg_path, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(["benchmark"] + agent_names) - for bench in HARDCODED_BENCHMARKS: - row = [bench] + [all_avg[bench][agent] for agent in agent_names] - writer.writerow(row) - print(f"Written: {avg_path}") - - # Write std CSV (over models) - std_path = os.path.join(results_dir, "aggregated_std_over_models.csv") - with open(std_path, "w", newline="") as f: - writer = csv.writer(f) - writer.writerow(["benchmark"] + agent_names) - for bench in HARDCODED_BENCHMARKS: - row = [bench] + [all_std[bench][agent] for agent in agent_names] - writer.writerow(row) - 
def load_contamination(contamination_path: str):
    """
    Read a contamination judgement file and map it to a verdict.

    Returns True for "contamination detected", False for
    "no contamination detected", "IMPORTANT ERR" for any other content, and
    "ERR" when the file is missing or cannot be read.
    """
    verdicts = {
        "contamination detected": True,
        "no contamination detected": False,
    }

    if os.path.exists(contamination_path):
        try:
            with open(contamination_path, "r") as handle:
                text = handle.read().strip()
        except Exception:
            return "ERR"
        return verdicts.get(text, "IMPORTANT ERR")

    return "ERR"
def process_method(method_path: str, method_name: str):
    """
    For a single method dir (results/method_name), collect the newest run per
    (benchmark, model), then write a contamination CSV.

    Every sub-directory name must consist of exactly four "_"-separated
    fields; the first is used as the benchmark, the third as the model and
    the fourth as an integer run id (the second field is ignored). Only the
    run with the numerically highest run id is kept for each
    (benchmark, model) pair. Nothing is written when no run directory exists.

    Raises:
        ValueError: if a sub-directory name does not have that shape or its
            run id is not an integer.
    """
    # key: (benchmark, model) -> value: {"run_id": int, "path": str}
    latest_runs = {}

    for entry in os.listdir(method_path):
        entry_path = os.path.join(method_path, entry)
        if not os.path.isdir(entry_path):
            continue
        try:
            benchmark, _, model, run_id_str = entry.split("_")
            # Run ids must be compared numerically: as strings "9" > "10",
            # which would keep an older run as the "latest" one.
            run_id = int(run_id_str)
        except ValueError as e:
            print(entry)
            raise ValueError(f"{entry}, {method_path}") from e
        key = (benchmark, model)

        # keep only highest run_id per (benchmark, model)
        if key not in latest_runs or run_id > latest_runs[key]["run_id"]:
            latest_runs[key] = {
                "run_id": run_id,
                "path": entry_path,
            }

    if not latest_runs:
        # nothing to do for this method
        return

    # Collect distinct benchmarks and models
    benchmarks = sorted({b for (b, _m) in latest_runs})
    models = sorted({m for (_b, m) in latest_runs})

    # The CSV is written into the results directory itself
    csv_path = os.path.join(get_results_dir(), f"{OUTPUT_PREFIX}{method_name}.csv")

    with open(csv_path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        # header
        writer.writerow(["model"] + benchmarks)

        # rows
        for model in models:
            row = [model]
            for bench in benchmarks:
                cell = ""  # stays empty when this (benchmark, model) has no run
                key = (bench, model)
                if key in latest_runs:
                    run_dir = latest_runs[key]["path"]
                    contamination_path = os.path.join(run_dir, "contamination_judgement.txt")
                    disallowed_path = os.path.join(run_dir, "disallowed_model_judgement.txt")

                    contamination = load_contamination(contamination_path)
                    disallowed_model = load_disallowed_model(disallowed_path)
                    cell = combine_results(contamination, disallowed_model)
                row.append(cell)
            writer.writerow(row)

    print(f"Written: {csv_path}")
def is_number(value: str) -> bool:
    """Tell whether *value* is a non-empty string that ``float()`` accepts."""
    if value:
        try:
            float(value)
        except ValueError:
            return False
        return True
    return False
def process_method(method_name: str, baseline_data: dict, results_dir: str):
    """
    Build ``final_<method>.csv`` for one method.

    Reads ``aggregated_<method>.csv`` and, if present,
    ``contamination_<method>.csv`` from *results_dir*. A cell is replaced by
    the matching *baseline_data* value (or "" when the baseline has none)
    whenever the cell is not a number or its contamination flag is non-blank.
    Returns without writing anything when the aggregated CSV has no rows.
    """
    scores, bench_names = load_csv_as_dict(
        os.path.join(results_dir, f"aggregated_{method_name}.csv")
    )
    if not scores:
        return

    # The contamination file is optional; a missing one loads as empty.
    flags, _ = load_csv_as_dict(
        os.path.join(results_dir, f"contamination_{method_name}.csv")
    )

    model_names = sorted(scores.keys())

    for name in model_names:
        model_flags = flags.get(name, {})
        for bench in bench_names:
            cell = scores[name].get(bench, "")
            flag = model_flags.get(bench, "")
            if not is_number(cell) or flag.strip():
                scores[name][bench] = baseline_data.get(name, {}).get(bench, "")

    output_path = os.path.join(results_dir, f"{OUTPUT_PREFIX}{method_name}.csv")

    with open(output_path, "w", newline="") as handle:
        out = csv.writer(handle)
        out.writerow(["model"] + bench_names)
        for name in model_names:
            out.writerow([name] + [scores[name].get(bench, "") for bench in bench_names])

    print(f"Written: {output_path}")
def load_metrics(metrics_path: str, method_name: str = None):
    """
    Produce the CSV cell for one run.

    When metrics.json can be read and holds a non-null "accuracy", that value
    is returned as a string. Otherwise the result describes why not:
      - "ERR" for the "baseline_zeroshot" method, whatever the cause
      - "not avl." if the run directory has no time_taken.txt
      - "not stored" if time_taken.txt exists but final_model/ does not
      - "ERR" in every remaining case
    """
    accuracy = None
    if os.path.exists(metrics_path):
        try:
            with open(metrics_path, "r") as handle:
                accuracy = json.load(handle).get("accuracy")
        except Exception:
            # Unreadable or malformed metrics are treated like missing ones.
            accuracy = None

    if accuracy is not None:
        return str(accuracy)

    if method_name == "baseline_zeroshot":
        return "ERR"

    run_dir = os.path.dirname(metrics_path)
    if not os.path.exists(os.path.join(run_dir, "time_taken.txt")):
        return "not avl."

    if os.path.isdir(os.path.join(run_dir, "final_model")):
        return "ERR"
    return "not stored"
def parse_args():
    """Parse the optional run-id window (--min-run-id / --max-run-id) from argv."""
    cli = argparse.ArgumentParser(description="Aggregate latest benchmark runs into CSV files.")
    bounds = (
        ("--min-run-id", "Inclusive lower bound for run ids to consider."),
        ("--max-run-id", "Exclusive upper bound for run ids to consider."),
    )
    for flag, help_text in bounds:
        cli.add_argument(flag, type=int, default=None, help=help_text)
    return cli.parse_args()
def _numeric(x: object) -> bool:
    """Return True for ints and floats; bools are excluded although bool subclasses int."""
    return isinstance(x, (int, float)) and not isinstance(x, bool)


def _summarize(vals: list[float]) -> dict[str, float | int]:
    """Return mean, sample std, stderr, min, max and n for a non-empty list of numbers."""
    n = len(vals)
    mean = sum(vals) / n
    if n > 1:
        std = math.sqrt(sum((x - mean) ** 2 for x in vals) / (n - 1))
    else:
        # A single run has no spread to estimate.
        std = 0.0
    return {
        "mean": mean,
        "std": std,
        "stderr": std / math.sqrt(n),
        "min": min(vals),
        "max": max(vals),
        "n": n,
    }


def main() -> None:
    """Aggregate every run file matching --runs-glob and write the result to --output.

    Each run file must hold a JSON object. For every top-level key, the
    numeric values found across runs are summarized; keys with no numeric
    value in any run are left out of "metrics". The raw records and the
    sorted list of source paths are written alongside the statistics.

    Exits with an error message when the glob matches nothing or when a run
    file's top-level value is not a JSON object.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--runs-glob", required=True,
                        help="Glob matching per-run metrics JSON files.")
    parser.add_argument("--output", required=True,
                        help="Path to write the aggregated metrics JSON.")
    args = parser.parse_args()

    paths = sorted(glob.glob(args.runs_glob))
    if not paths:
        sys.exit(f"no run files matched {args.runs_glob}")

    runs: list[dict] = []
    for path in paths:
        # JSON interchange is UTF-8; do not depend on the locale encoding.
        with open(path, "r", encoding="utf-8") as f:
            record = json.load(f)
        if not isinstance(record, dict):
            sys.exit(
                f"{path}: expected a JSON object of metrics, "
                f"got {type(record).__name__}"
            )
        runs.append(record)

    keys = sorted({k for r in runs for k in r})

    aggregated: dict[str, dict[str, float | int]] = {}
    for k in keys:
        vals = [r[k] for r in runs if k in r and _numeric(r[k])]
        if vals:
            aggregated[k] = _summarize(vals)

    out = {
        "n_runs": len(runs),
        "metrics": aggregated,
        "per_run": runs,
        "run_files": paths,
    }

    with open(args.output, "w", encoding="utf-8") as f:
        json.dump(out, f, indent=2)
def get_results_dir():
    """Return the results directory: $POST_TRAIN_BENCH_RESULTS_DIR, or "results" if unset."""
    return os.getenv("POST_TRAIN_BENCH_RESULTS_DIR", "results")
def compute_benchmark_average(data: dict, benchmark: str, models: list = None) -> str:
    """
    Average one benchmark column over a set of models.

    *models* selects the rows to include; when it is None every model in
    *data* is used. Missing models, empty cells and cells that do not parse
    as a float are ignored. The mean is returned formatted to four decimals,
    or "" when no usable score was found.
    """
    selected = data if models is None else models

    scores = []
    for name in selected:
        raw = data.get(name, {}).get(benchmark, "")
        if not raw:
            continue
        try:
            scores.append(float(raw))
        except ValueError:
            continue

    return f"{sum(scores) / len(scores):.4f}" if scores else ""
Default: summary.csv in results dir.", - ) - return parser.parse_args() - - -def main(): - args = parse_args() - results_dir = get_results_dir() - - # Load baseline data - baseline_path = os.path.join(results_dir, "aggregated_baseline_zeroshot.csv") - baseline_data, baseline_benchmarks = load_csv_as_dict(baseline_path) - - if not baseline_data: - print(f"Error: No baseline data found at {baseline_path}") - return - - # Load all method data - method_data = {} - method_benchmarks = {} - - for method in args.methods: - final_path = os.path.join(results_dir, f"final_{method}.csv") - data, benchmarks = load_csv_as_dict(final_path) - - if not data: - print(f"Warning: No data found for method '{method}' at {final_path}") - continue - - method_data[method] = data - method_benchmarks[method] = set(benchmarks) - - if not method_data: - print("Error: No valid method data found.") - return - - # Find common benchmarks across baseline and all methods - common_benchmarks = set(baseline_benchmarks) - for method, benchmarks in method_benchmarks.items(): - common_benchmarks &= benchmarks - - common_benchmarks = sorted(common_benchmarks) - - if not common_benchmarks: - print("Error: No common benchmarks found across all files.") - return - - print(f"Common benchmarks ({len(common_benchmarks)}): {', '.join(common_benchmarks)}") - - # Prepare output - output_path = args.output or os.path.join(results_dir, "summary.csv") - methods_ordered = [m for m in args.methods if m in method_data] - - with open(output_path, "w", newline="") as f: - writer = csv.writer(f) - - # Header: benchmark, baseline_base, baseline_instruct, method1, method2, ... 
- # Apply METHOD_NAME_MAP to simplify method names in the header - display_methods = [METHOD_NAME_MAP.get(m, m) for m in methods_ordered] - writer.writerow(["benchmark", "baseline_base", "baseline_instruct"] + display_methods) - - # Benchmark rows - for bench in common_benchmarks: - row = [bench] - - # Baseline base models average - row.append(compute_benchmark_average(baseline_data, bench, BASE_MODELS)) - - # Baseline instruct models average - row.append(compute_benchmark_average(baseline_data, bench, INSTRUCT_MODELS)) - - # Method averages (over all models in each method's file) - for method in methods_ordered: - row.append(compute_benchmark_average(method_data[method], bench)) - - writer.writerow(row) - - print(f"Written: {output_path}") - - -if __name__ == "__main__": - main() \ No newline at end of file diff --git a/scripts/aggregate_time.py b/scripts/aggregate_time.py deleted file mode 100644 index 308276c0..00000000 --- a/scripts/aggregate_time.py +++ /dev/null @@ -1,196 +0,0 @@ -#!/usr/bin/env python3 -import argparse -import os -import csv -import re - -OUTPUT_PREFIX = "aggregated_time_" # e.g. aggregated_time_method.csv -OVERVIEW_FILENAME = "aggregated_time_overview.csv" -BUDGET_SECONDS = 10 * 3600 # 10 hours - - -def parse_time_hms(time_str: str) -> int | None: - """ - Parse a time string in H:M:S format and return total seconds. - Returns None if parsing fails. 
- """ - match = re.match(r'^(\d+):(\d{1,2}):(\d{1,2})$', time_str.strip()) - if not match: - return None - hours, minutes, seconds = map(int, match.groups()) - if minutes >= 60 or seconds >= 60: - return None - return hours * 3600 + minutes * 60 + seconds - - -def format_time_hms(total_seconds: int) -> str: - """Convert total seconds to H:M:S format.""" - hours = total_seconds // 3600 - minutes = (total_seconds % 3600) // 60 - seconds = total_seconds % 60 - return f"{hours}:{minutes:02d}:{seconds:02d}" - - -def load_time_taken(run_dir: str) -> tuple[str, int | None]: - """ - Return the time taken as (display_string, total_seconds). - - Returns (H:M:S string, seconds) if valid - - Returns ("ERR", None) if time_taken.txt doesn't exist or is invalid - """ - time_taken_path = os.path.join(run_dir, "time_taken.txt") - - if not os.path.exists(time_taken_path): - return "ERR", None - - try: - with open(time_taken_path, "r") as f: - time_str = f.read().strip() - total_seconds = parse_time_hms(time_str) - if total_seconds is None: - return "ERR", None - return format_time_hms(total_seconds), total_seconds - except Exception: - return "ERR", None - - -def process_method(method_path: str, method_name: str, min_run_id=None, max_run_id=None) -> dict: - """ - For a single method dir (results/method_name), collect the newest run per - (benchmark, model), then write a CSV. - - Returns a dict with timing statistics for the overview. 
- """ - # key: (benchmark, model) -> value: {"run_id": int, "path": str} - latest_runs = {} - - for entry in os.listdir(method_path): - entry_path = os.path.join(method_path, entry) - if not os.path.isdir(entry_path): - continue - - try: - benchmark, _, model, run_id_str = entry.split("_") - run_id = int(run_id_str) - key = (benchmark, model) - except ValueError as e: - print(entry) - raise ValueError(f"{entry}, {method_path}") - - if max_run_id is not None and run_id >= max_run_id: - continue - if min_run_id is not None and run_id < min_run_id: - continue - - # keep only highest run_id per (benchmark, model) - if key not in latest_runs or run_id > latest_runs[key]["run_id"]: - latest_runs[key] = { - "run_id": run_id, - "path": entry_path, - } - - if not latest_runs: - return {} - - benchmarks = sorted({b for (b, m) in latest_runs.keys()}) - models = sorted({m for (b, m) in latest_runs.keys()}) - - csv_path = os.path.join(get_results_dir(), f"{OUTPUT_PREFIX}{method_name}.csv") - - # Collect timing stats for overview - total_seconds = 0 - valid_count = 0 - - with open(csv_path, "w", newline="") as csvfile: - writer = csv.writer(csvfile) - writer.writerow(["model"] + benchmarks) - - for model in models: - row = [model] - for bench in benchmarks: - cell = "" - key = (bench, model) - if key in latest_runs: - run_dir = latest_runs[key]["path"] - cell, seconds = load_time_taken(run_dir) - if seconds is not None: - total_seconds += seconds - valid_count += 1 - row.append(cell) - writer.writerow(row) - print(f"Written: {csv_path}") - - return { - "total_seconds": total_seconds, - "valid_count": valid_count, - } - - -def write_overview(method_stats: dict[str, dict]): - """Write an overview CSV with average times per method.""" - csv_path = os.path.join(get_results_dir(), OVERVIEW_FILENAME) - - with open(csv_path, "w", newline="") as csvfile: - writer = csv.writer(csvfile) - writer.writerow(["method", "average_time", "percentage"]) - - for method_name in 
def write_overview(method_stats: dict[str, dict]):
    """Write the overview CSV: one row per method with its average run time.

    Each value of *method_stats* carries ``total_seconds`` and ``valid_count``.
    Methods without any valid timing get ``N/A`` in both computed columns.
    """
    csv_path = os.path.join(get_results_dir(), OVERVIEW_FILENAME)

    with open(csv_path, "w", newline="") as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["method", "average_time", "percentage"])

        for name in sorted(method_stats):
            entry = method_stats[name]
            total_secs = entry["total_seconds"]
            valid = entry["valid_count"]

            if valid <= 0:
                writer.writerow([name, "N/A", "N/A"])
                continue

            # Whole-second average, also used for the share of the budget.
            avg_secs = total_secs // valid
            avg_str = format_time_hms(avg_secs)
            pct = (avg_secs / BUDGET_SECONDS) * 100
            writer.writerow([name, avg_str, f"{pct:.1f}%"])

    print(f"Written: {csv_path}")


def get_results_dir():
    """Return the results directory (env override, else ``results``)."""
    return os.getenv("POST_TRAIN_BENCH_RESULTS_DIR", 'results')


def parse_args():
    """Parse the optional run-id bounds from the command line."""
    parser = argparse.ArgumentParser(description="Aggregate latest benchmark run times into CSV files.")
    options = (
        ("--min-run-id", "Inclusive lower bound for run ids to consider."),
        ("--max-run-id", "Exclusive upper bound for run ids to consider."),
    )
    for flag, help_text in options:
        parser.add_argument(flag, type=int, default=None, help=help_text)
    return parser.parse_args()


def main():
    """Process every method directory, then write the overview if any had runs."""
    args = parse_args()
    results_dir = get_results_dir()

    method_stats = {}
    for method_name in os.listdir(results_dir):
        method_path = os.path.join(results_dir, method_name)
        if os.path.isdir(method_path):
            stats = process_method(
                method_path,
                method_name,
                min_run_id=args.min_run_id,
                max_run_id=args.max_run_id,
            )
            if stats:
                method_stats[method_name] = stats

    if method_stats:
        write_overview(method_stats)
def get_results_dir():
    """Return the results directory (env override, else ``results``)."""
    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")


def mean(values: list[float]) -> float:
    """Arithmetic mean; raises ``ZeroDivisionError`` on an empty list."""
    return sum(values) / len(values)


def stddev(values: list[float]) -> float:
    """Sample standard deviation (n - 1 denominator).

    A single observation has no spread to estimate; it yields ``0.0`` instead
    of dividing by zero, so agent groups with one run can still be reported.
    """
    if len(values) == 1:
        return 0.0
    avg = mean(values)
    variance = sum((x - avg) ** 2 for x in values) / (len(values) - 1)
    return math.sqrt(variance)


def parse_time_to_hours(time_str: str) -> float:
    """Parse time string like '8:17:28' or '0:55:17' to hours as float."""
    parts = time_str.split(":")
    hours = int(parts[0])
    minutes = int(parts[1])
    seconds = int(parts[2])
    return hours + minutes / 60 + seconds / 3600


def load_time_csv(csv_path: str) -> dict[str, float]:
    """
    Load time CSV into dict: {method: hours}.

    Rows whose ``average_time`` is ``N/A`` (the marker the overview uses for
    methods without a valid timing) carry no data and are left out.
    """
    data = {}

    with open(csv_path, "r", newline="") as f:
        for row in csv.DictReader(f):
            average_time = row["average_time"]
            if average_time == "N/A":
                continue
            data[row["method"]] = parse_time_to_hours(average_time)

    return data


def format_hours_to_time(hours: float) -> str:
    """Convert hours float back to H:MM:SS format.

    The value is rounded to the nearest second: truncating would turn float
    noise such as 29847.999999 into a time one second too short.
    """
    total_seconds = round(hours * 3600)
    h = total_seconds // 3600
    m = (total_seconds % 3600) // 60
    s = total_seconds % 60
    return f"{h}:{m:02d}:{s:02d}"


def main():
    """Aggregate per-method average times into per-agent mean / stddev."""
    results_dir = get_results_dir()
    time_csv_path = os.path.join(results_dir, "aggregated_time_overview.csv")

    # Load time data
    time_data = load_time_csv(time_csv_path)

    # Compute aggregated statistics for each agent group
    aggregated_results = {}
    for agent_name, method_names in HARDCODED_AGENT_MAP.items():
        hours_list = [time_data[method_name] for method_name in method_names]
        aggregated_results[agent_name] = {
            "avg_hours": mean(hours_list),
            "std_hours": stddev(hours_list),
            "n": len(hours_list),
        }

    # Write aggregated time CSV
    output_path = os.path.join(results_dir, "time_aggregated.csv")
    with open(output_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["agent", "avg_time", "std_time", "n"])
        for agent_name in HARDCODED_AGENT_MAP.keys():
            data = aggregated_results[agent_name]
            writer.writerow([
                agent_name,
                format_hours_to_time(data["avg_hours"]),
                format_hours_to_time(data["std_hours"]),
                data["n"],
            ])
    print(f"Written: {output_path}")
def parse_directory_name(dirname):
    """Extract benchmark and model from directory name.

    Directory names look like ``<benchmark>_<org>_<model>_<run_id>``; the
    known benchmark names are the entries of ``src/eval/tasks`` (relative to
    the current working directory).

    Returns:
        ``(benchmark, model, run_id)`` with ``model`` in ``org/model`` form and
        ``run_id`` as a string, or ``(None, None, None)`` when the name matches
        no benchmark or has no run-id suffix.
    """
    benchmarks_path = Path('src/eval/tasks')
    # Longest names first so that a benchmark which is itself a prefix of
    # another one cannot shadow it, independent of directory listing order.
    benchmarks = sorted(
        (item.name for item in benchmarks_path.iterdir()),
        key=len,
        reverse=True,
    )

    for benchmark in benchmarks:
        prefix = benchmark + '_'
        if dirname.startswith(prefix):
            rest = dirname[len(prefix):]
            break
    else:
        return None, None, None

    # Extract model name and run ID
    model, sep, run_id = rest.rpartition('_')
    if not sep:
        return None, None, None
    return benchmark, model.replace('_', '/'), run_id  # back to org/model format


def _run_id_sort_key(run):
    """Order runs numerically by run id; non-numeric ids sort below numeric ones."""
    run_id = run['run_id']
    if run_id.isdigit():
        return (1, int(run_id), '')
    return (0, 0, run_id)


def get_latest_results(results_dir):
    """Get the latest run for each model-benchmark combination.

    Scans ``<results_dir>/baseline_zeroshot/*`` and keeps, per
    ``(benchmark, model)``, the run with the numerically highest run id.

    Returns:
        ``{(benchmark, model): {"path": Path to time_taken.txt, "exists": bool}}``
    """
    results_path = Path(results_dir)

    # Store all runs grouped by (benchmark, model)
    runs = defaultdict(list)

    # Scan all directories
    for subdir in results_path.glob('baseline_zeroshot/*'):
        if not subdir.is_dir():
            continue
        benchmark, model, run_id = parse_directory_name(subdir.name)
        if benchmark and model and run_id:
            time_file = subdir / 'time_taken.txt'
            runs[(benchmark, model)].append({
                'path': time_file,
                'run_id': run_id,
                'dir_name': subdir.name,
                'exists': time_file.exists(),
            })

    # Get the latest run for each combination. Run ids are strings, so they
    # must be compared as numbers: as text, "9" would beat "10".
    latest_runs = {}
    for key, run_list in runs.items():
        latest = max(run_list, key=_run_id_sort_key)
        latest_runs[key] = {
            'path': latest['path'],
            'exists': latest['exists'],
        }

    return latest_runs
def read_time_taken(time_path):
    """Return the content of a time file, dropping a leading ``00`` hours field.

    ``00:MM:SS`` becomes ``MM:SS``; any other content is returned stripped but
    otherwise unchanged.
    """
    with open(time_path, 'r') as f:
        time_str = f.read().strip()

    pieces = time_str.split(':')
    if len(pieces) == 3 and pieces[0] == '00':
        return f"{pieces[1]}:{pieces[2]}"
    return time_str


def create_results_csv(results_dir, output_file='benchmark_times.csv'):
    """Write a model x benchmark CSV of the latest baseline run times.

    Cells hold the recorded time, ``ERR`` when the latest run has no
    ``time_taken.txt``, or ``N/A`` when the combination was never run. Every
    cell is prefixed with a single quote so spreadsheets keep it as text.
    """
    latest_runs = get_latest_results(results_dir)

    # Axes of the table
    benchmarks = sorted({b for b, _ in latest_runs})
    models = sorted({m for _, m in latest_runs})

    def cell(model, benchmark):
        key = (benchmark, model)
        if key not in latest_runs:
            return 'N/A'
        run_info = latest_runs[key]
        if not run_info['exists']:
            return 'ERR'
        return read_time_taken(str(run_info['path']))

    results = {
        model: {benchmark: cell(model, benchmark) for benchmark in benchmarks}
        for model in models
    }

    with open(output_file, 'w', newline='') as f:
        writer = csv.writer(f)
        writer.writerow(['Model'] + benchmarks)
        for model in models:
            quoted = [f"'{results[model][b]}" for b in benchmarks]
            writer.writerow([model] + quoted)

    print(f"Results saved to {output_file}")
    print(f"Models: {len(models)}")
    print(f"Benchmarks: {len(benchmarks)}")
    print(f"Total entries: {len(latest_runs)}")

    # Report how many cells could not be filled
    err_count = sum(1 for model in models for b in benchmarks if results[model][b] == 'ERR')
    if err_count > 0:
        print(f"Warning: {err_count} entries missing time_taken.txt (marked as ERR)")
# Display names for known method directories; unknown methods keep their raw name.
METHOD_NAME_MAP = {
    "claude_claude-sonnet-4-5_final_v3": "claude sonnet 4.5",
    "claude_claude-opus-4-5_final_v3": "claude opus 4.5",
    "codex_gpt-5.1-codex-max_final_v3": "gpt-5.1-codex-max",
    "codex_gpt-5.2_final_v3": "gpt-5.2",
    "gemini_models_gemini-3-pro-preview_final_v3": "gemini-3-pro",
}


def get_results_dir():
    """Return the results directory (env override, else ``results``)."""
    return os.getenv("POST_TRAIN_BENCH_RESULTS_DIR", "results")


def load_csv_rows(csv_path: str) -> tuple[list, list]:
    """
    Read a CSV file and return ``(header, rows)``.

    Blank rows are dropped. A missing file, an empty file or an empty header
    row all yield ``([], [])``.
    """
    if not os.path.exists(csv_path):
        return [], []

    with open(csv_path, "r", newline="") as f:
        reader = csv.reader(f)
        header = next(reader, None)
        if not header:
            return [], []
        rows = [row for row in reader if row]

    return header, rows
def parse_args():
    """Parse the method list and output options from the command line."""
    parser = argparse.ArgumentParser(
        description="Aggregate final CSVs into a single concatenated file."
    )
    parser.add_argument(
        "methods",
        nargs="+",
        help="List of methods to include in the aggregation.",
    )
    parser.add_argument(
        "-o", "--output",
        default=None,
        help="Output CSV filename. Default: summary_concat.csv in results dir.",
    )
    parser.add_argument(
        "--include-baseline",
        action="store_true",
        help="Include baseline data as the first section.",
    )
    return parser.parse_args()


def main():
    """Concatenate the per-method final CSVs into one sectioned CSV.

    Each section is a title row, the header row, the data rows and a blank
    row. Sections without data are skipped with a warning on stdout.
    """
    args = parse_args()
    results_dir = get_results_dir()
    output_path = args.output or os.path.join(results_dir, "summary_concat.csv")

    with open(output_path, "w", newline="") as f:
        writer = csv.writer(f)

        def write_section(title, header, rows):
            writer.writerow([title])
            writer.writerow(header)
            writer.writerows(rows)
            writer.writerow([])  # blank separator line

        # Optional baseline section comes first
        if args.include_baseline:
            baseline_path = os.path.join(results_dir, "aggregated_baseline_zeroshot.csv")
            header, rows = load_csv_rows(baseline_path)
            if header and rows:
                write_section("baseline", header, rows)
            else:
                print(f"Warning: No baseline data found at {baseline_path}")

        # One section per requested method, in command-line order
        for method in args.methods:
            final_path = os.path.join(results_dir, f"final_{method}.csv")
            header, rows = load_csv_rows(final_path)
            if header and rows:
                write_section(METHOD_NAME_MAP.get(method, method), header, rows)
            else:
                print(f"Warning: No data found for method '{method}' at {final_path}")

    print(f"Written: {output_path}")
0.3549107142857143, + "gsm8k": 0.8847611827141774, + "healthbench": 0.44918867035528026, + "humaneval": 0.6890243902439024 + }, + "Qwen3-1.7B-Base": { + "aime2025": 0.0, + "arenahardwriting": 0.009142053445850914, + "bfcl": 0.0, + "gpqamain": 0.140625, + "gsm8k": 0.12661106899166036, + "healthbench": 0.07537565969807473, + "humaneval": 0.07926829268292683 + }, + "Qwen3-4B": { + "aime2025": 0.5333333333333333, + "arenahardwriting": 0.8683943089430894, + "bfcl": 0.95, + "gpqamain": 0.44642857142857145, + "gsm8k": 0.9378316906747536, + "healthbench": 0.5272399437524256, + "humaneval": 0.774390243902439 + }, + "Qwen3-4B-Base": { + "aime2025": 0.03333333333333333, + "arenahardwriting": 0.03417533432392273, + "bfcl": 0.0, + "gpqamain": 0.13392857142857142, + "gsm8k": 0.4184988627748294, + "healthbench": 0.13383521639663787, + "humaneval": 0.36585365853658536 + }, + "SmolLM3-3B": { + "aime2025": 0.26666666666666666, + "arenahardwriting": 0.492, + "bfcl": 0.84, + "gpqamain": 0.3325892857142857, + "gsm8k": 0.8218347232752085, + "healthbench": 0.2957717718639611, + "humaneval": 0.7012195121951219 + }, + "SmolLM3-3B-Base": { + "aime2025": 0.03333333333333333, + "arenahardwriting": 0.004225352112676056, + "bfcl": 0.0, + "gpqamain": 0.049107142857142856, + "gsm8k": 0.21076573161485973, + "healthbench": 0.0, + "humaneval": 0.06097560975609756 + }, + "gemma-3-4b-it": { + "aime2025": 0.1, + "arenahardwriting": 0.948, + "bfcl": 0.67, + "gpqamain": 0.31473214285714285, + "gsm8k": 0.8354814253222138, + "healthbench": 0.46063396051286026, + "humaneval": 0.6951219512195121 + }, + "gemma-3-4b-pt": { + "aime2025": 0.0, + "arenahardwriting": 0.0028530670470756064, + "bfcl": 0.06, + "gpqamain": 0.015625, + "gsm8k": 0.06141015921152388, + "healthbench": 0.17039403723633986, + "humaneval": 0.006097560975609756 + } + }, + "fewshot": { + "Qwen3-1.7B-Base": { + "aime2025": 0.05333333333333333, + "arenahardwriting": 0.05314625850340136, + "bfcl": 0.0, + "gpqamain": 0.25959821428571417, + 
"gsm8k": 0.46679302501895537, + "healthbench": 0.2110110691560308, + "humaneval": 0.25243902439024396 + }, + "Qwen3-4B-Base": { + "aime2025": 0.09000000000000001, + "arenahardwriting": 0.19168260038240917, + "bfcl": 0.0, + "gpqamain": 0.29888392857142837, + "gsm8k": 0.7438210765731573, + "healthbench": 0.2179351466647625, + "humaneval": 0.6774390243902438 + }, + "SmolLM3-3B-Base": { + "aime2025": 0.06000000000000001, + "arenahardwriting": 0.03248811410459588, + "bfcl": 0.0, + "gpqamain": 0.13236607142857182, + "gsm8k": 0.5298711144806676, + "healthbench": 0.10165123092180756, + "humaneval": 0.3237804878048783 + }, + "gemma-3-4b-pt": { + "aime2025": 0.0, + "arenahardwriting": 0.01257396449704142, + "bfcl": 0.06699999999999998, + "gpqamain": 0.21406249999999985, + "gsm8k": 0.0583775587566339, + "healthbench": 0.23317845064882012, + "humaneval": 0.004878048780487805 + } + } +} diff --git a/scripts/collect.py b/scripts/collect.py new file mode 100644 index 00000000..fa366ece --- /dev/null +++ b/scripts/collect.py @@ -0,0 +1,259 @@ +#!/usr/bin/env python3 +""" +Collect results from raw run directories into per-method CSVs. + +For each method directory in the results dir, does a single pass: + 1. Finds the latest run per (benchmark, model) + 2. Reads metrics.json, contamination files, and time_taken.txt + 3. Applies baseline fallback for contaminated or errored cells + 4. Writes final_{method}.csv, contamination_{method}.csv + +Also writes a time_overview.csv summarising average time per method. 
# Directories to skip (baselines are hardcoded in baselines.json)
SKIP_METHODS = {"baseline", "baseline_zeroshot"}


def _write_grid_csv(path, models, benchmarks, grid):
    """Write a ``{model: {bench: value}}`` grid as a CSV with a ``model`` column."""
    with open(path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["model"] + benchmarks)
        for model in models:
            writer.writerow([model] + [grid[model][bench] for bench in benchmarks])
    print(f"Written: {path}")


def collect_method(
    method_path: str,
    method_name: str,
    baseline_data: dict[str, dict[str, str]],
    output_dir: str,
    min_run_id: int | None = None,
    max_run_id: int | None = None,
) -> dict | None:
    """
    Gather the latest run of every (benchmark, model) pair for one method.

    Files written into *output_dir*:
      - contamination_{method_name}.csv (contamination flags)
      - final_{method_name}.csv (scores, with baseline fallback applied)

    A score is replaced by its baseline value when it is not numeric or when
    the cell carries a contamination flag.

    Returns:
        ``{"total_seconds": int, "valid_count": int}`` timing totals, or
        ``None`` when the method directory holds no matching runs.

    Raises:
        KeyError: If a fallback is needed but *baseline_data* has no entry
            for that model/benchmark.
    """
    latest_runs = walk_latest_runs(method_path, min_run_id, max_run_id)
    if not latest_runs:
        return None

    benchmarks = sorted({b for b, _ in latest_runs})
    models = sorted({m for _, m in latest_runs})

    # Single pass over the grid: metrics, contamination and timing together
    metrics_grid = {}        # {model: {bench: str}}
    contamination_grid = {}  # {model: {bench: str}}
    time_total_seconds = 0
    time_valid_count = 0

    for model in models:
        scores = metrics_grid[model] = {}
        flags = contamination_grid[model] = {}

        for bench in benchmarks:
            key = (bench, model)
            if key not in latest_runs:
                # No run for this pair: leave both cells blank
                scores[bench] = ""
                flags[bench] = ""
                continue

            run_dir = latest_runs[key]["path"]

            scores[bench] = load_metrics(
                os.path.join(run_dir, "metrics.json"), method_name
            )

            contamination = load_contamination(
                os.path.join(run_dir, "contamination_judgement.txt")
            )
            disallowed = load_disallowed_model(
                os.path.join(run_dir, "disallowed_model_judgement.txt")
            )
            flags[bench] = combine_contamination_results(contamination, disallowed)

            _, seconds = load_time_taken(run_dir)
            if seconds is not None:
                time_total_seconds += seconds
                time_valid_count += 1

    # Contamination flags are written before any fallback is applied
    _write_grid_csv(
        os.path.join(output_dir, f"contamination_{method_name}.csv"),
        models,
        benchmarks,
        contamination_grid,
    )

    # Baseline fallback: triggered by a non-numeric score or a non-empty flag
    for model in models:
        for bench in benchmarks:
            value = metrics_grid[model][bench]
            flag = contamination_grid[model][bench].strip()

            reasons = []
            if not is_number(value):
                reasons.append(f"non-numeric value ({value!r})")
            if flag:
                reasons.append(f"contamination flag ({flag!r})")
            if not reasons:
                continue

            if model not in baseline_data or bench not in baseline_data[model]:
                raise KeyError(
                    f"baselines.json missing entry for model={model!r} "
                    f"benchmark={bench!r}; needed as fallback in method "
                    f"{method_name!r} (triggered by {', '.join(reasons)})"
                )
            metrics_grid[model][bench] = baseline_data[model][bench]

    # Final scores, fallback included
    _write_grid_csv(
        os.path.join(output_dir, f"final_{method_name}.csv"),
        models,
        benchmarks,
        metrics_grid,
    )

    return {
        "total_seconds": time_total_seconds,
        "valid_count": time_valid_count,
    }
def write_time_overview(method_stats: dict[str, dict], output_dir: str):
    """Write ``time_overview.csv``: average run time and budget share per method.

    Methods are listed alphabetically; a method with no valid timing gets
    ``N/A`` in both computed columns.
    """
    csv_path = os.path.join(output_dir, "time_overview.csv")

    with open(csv_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["method", "average_time", "percentage"])

        for name in sorted(method_stats):
            entry = method_stats[name]
            total_secs = entry["total_seconds"]
            valid = entry["valid_count"]

            if valid <= 0:
                writer.writerow([name, "N/A", "N/A"])
                continue

            # Whole-second average; the percentage is relative to the budget
            avg_secs = total_secs // valid
            avg_str = format_time_hms(avg_secs)
            pct = (avg_secs / BUDGET_SECONDS) * 100
            writer.writerow([name, avg_str, f"{pct:.1f}%"])

    print(f"Written: {csv_path}")
def parse_args():
    """Parse directory and run-id options from the command line."""
    parser = argparse.ArgumentParser(
        description="Collect raw results into per-method CSVs."
    )
    parser.add_argument(
        "--data-dir",
        default=None,
        help="Directory containing method subdirectories with raw run data. "
             "Defaults to POST_TRAIN_BENCH_RESULTS_DIR or 'results'.",
    )
    parser.add_argument(
        "--output-dir",
        default=None,
        help="Directory to write output CSVs. Defaults to same as --data-dir.",
    )
    run_id_bounds = (
        ("--min-run-id", "Inclusive lower bound for run IDs to consider."),
        ("--max-run-id", "Exclusive upper bound for run IDs to consider."),
    )
    for flag, help_text in run_id_bounds:
        parser.add_argument(flag, type=int, default=None, help=help_text)
    return parser.parse_args()


def main():
    """Collect every non-baseline method directory, then write the time overview."""
    args = parse_args()

    data_dir = args.data_dir or get_results_dir()
    output_dir = args.output_dir or data_dir
    os.makedirs(output_dir, exist_ok=True)

    # Baseline values used as fallback (hardcoded in baselines.json)
    baseline_data = get_baseline_fallback_data()

    method_stats = {}
    for method_name in sorted(os.listdir(data_dir)):
        # Baseline directories are skipped — their values are hardcoded
        if method_name in SKIP_METHODS:
            continue

        method_path = os.path.join(data_dir, method_name)
        if not os.path.isdir(method_path):
            continue

        stats = collect_method(
            method_path,
            method_name,
            baseline_data,
            output_dir,
            min_run_id=args.min_run_id,
            max_run_id=args.max_run_id,
        )
        if stats:
            method_stats[method_name] = stats

    if method_stats:
        write_time_overview(method_stats, output_dir)
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
FACTORS_PATH = os.path.join(SCRIPT_DIR, "factors.json")

# Mapping from CSV model names to factors.json model names
MODEL_NAME_MAPPING = {
    # Base/pretrained models
    "Qwen3-1.7B-Base": "Qwen3-1.7B",
    "Qwen3-4B-Base": "Qwen3-4B",
    "SmolLM3-3B-Base": "SmolLM3-3B",
    "gemma-3-4b-pt": "gemma-3-4b",
    # Instruct-tuned models
    "Qwen3-1.7B": "Qwen3-1.7B",
    "Qwen3-4B": "Qwen3-4B",
    "SmolLM3-3B": "SmolLM3-3B",
    "gemma-3-4b-it": "gemma-3-4b",
}

BASE_MODELS = {"Qwen3-1.7B-Base", "Qwen3-4B-Base", "SmolLM3-3B-Base", "gemma-3-4b-pt"}
INSTRUCT_MODELS = {"Qwen3-1.7B", "Qwen3-4B", "SmolLM3-3B", "gemma-3-4b-it"}


def get_results_dir():
    """Return the results directory (env override, else ``results``)."""
    return os.getenv("POST_TRAIN_BENCH_RESULTS_DIR", "results")


def load_factors(factors_path: str) -> dict:
    """Read the factors JSON file and return its content."""
    with open(factors_path, "r") as f:
        return json.load(f)


def load_baseline_csv(csv_path: str) -> tuple[dict, list]:
    """
    Read the baseline CSV into ``{model: {benchmark: float}}``.

    The first column is the model name, the remaining header cells are the
    benchmark names. Returns ``(data, benchmarks)``.
    """
    data = {}
    with open(csv_path, "r", newline="") as f:
        reader = csv.reader(f)
        benchmarks = next(reader)[1:]

        for row in reader:
            data[row[0]] = {
                bench: float(row[col])
                for col, bench in enumerate(benchmarks, start=1)
            }

    return data, benchmarks


def compute_metric(model_data: dict, factors: dict, benchmarks: list) -> float:
    """Return the factor-weighted sum of one model's benchmark values."""
    total = 0.0
    for bench in benchmarks:
        total += model_data[bench] * factors[bench]
    return total


def main():
    """Compute one weighted metric per baseline model and write them to a CSV."""
    parser = argparse.ArgumentParser(
        description="Compute baseline metrics using per-model factors."
    )
    parser.add_argument(
        "--output",
        default=None,
        help="Output CSV path. Defaults to baseline_metrics.csv in results dir.",
    )
    args = parser.parse_args()

    results_dir = get_results_dir()
    factors = load_factors(FACTORS_PATH)
    data, benchmarks = load_baseline_csv(
        os.path.join(results_dir, "aggregated_baseline.csv")
    )

    # Anything that is not a known base model is reported as "instruct"
    grouped = {"base": {}, "instruct": {}}
    for csv_model, model_data in data.items():
        model_factors = factors[MODEL_NAME_MAPPING[csv_model]]
        group = "base" if csv_model in BASE_MODELS else "instruct"
        grouped[group][csv_model] = compute_metric(model_data, model_factors, benchmarks)

    output_path = args.output or os.path.join(results_dir, "baseline_metrics.csv")
    with open(output_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["model_type", "model", "metric"])
        for group in ("base", "instruct"):
            for model in sorted(grouped[group]):
                writer.writerow([group, model, grouped[group][model]])

    print(f"Written: {output_path}")
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
# NOTE(review): the file's description mentions factors_by_benchmark.json while
# this path points at factors.json; the loaded mapping is indexed by benchmark
# name below — confirm which file is intended.
FACTORS_PATH = os.path.join(SCRIPT_DIR, "factors.json")

BASE_MODELS = {"Qwen3-1.7B-Base", "Qwen3-4B-Base", "SmolLM3-3B-Base", "gemma-3-4b-pt"}
INSTRUCT_MODELS = {"Qwen3-1.7B", "Qwen3-4B", "SmolLM3-3B", "gemma-3-4b-it"}


def get_results_dir():
    """Return the results directory (env override, else ``results``)."""
    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")


def load_factors(factors_path: str) -> dict:
    """Load factors from JSON file."""
    with open(factors_path, "r") as f:
        return json.load(f)


def load_baseline_csv(csv_path: str) -> tuple[dict, list]:
    """
    Load baseline CSV file into a dict: {model: {benchmark: value}}.
    Returns (data_dict, list_of_benchmarks).
    """
    data = {}
    with open(csv_path, "r", newline="") as f:
        reader = csv.reader(f)
        header = next(reader)
        benchmarks = header[1:]

        for row in reader:
            model = row[0]
            data[model] = {}
            for i, bench in enumerate(benchmarks):
                data[model][bench] = float(row[i + 1])

    return data, benchmarks


def compute_metric_by_benchmark(data: dict, factors: dict, benchmarks: list, models: set) -> float:
    """
    Compute weighted sum where each benchmark value is averaged across specified models.

    Only models present in both *data* and *models* take part in the average.

    Raises:
        ValueError: If there are benchmarks to average but none of *models*
            appears in *data* (previously an opaque ``ZeroDivisionError``).
    """
    model_list = [m for m in data if m in models]
    if benchmarks and not model_list:
        raise ValueError(
            f"none of the requested models {sorted(models)} are present in the data"
        )

    num_models = len(model_list)
    total = 0.0
    for bench in benchmarks:
        avg_value = sum(data[model][bench] for model in model_list) / num_models
        total += avg_value * factors[bench]
    return total


def main():
    """Compute the averaged base / instruct metrics and write them to a CSV."""
    parser = argparse.ArgumentParser(
        description="Compute baseline metrics using benchmark factors."
    )
    parser.add_argument(
        "--output",
        default=None,
        help="Output CSV path. Defaults to baseline_metrics_by_benchmark.csv in results dir.",
    )
    args = parser.parse_args()

    results_dir = get_results_dir()
    factors = load_factors(FACTORS_PATH)

    csv_path = os.path.join(results_dir, "aggregated_baseline.csv")
    data, benchmarks = load_baseline_csv(csv_path)

    # Compute averaged metrics for each group
    base_metric = compute_metric_by_benchmark(data, factors, benchmarks, BASE_MODELS)
    instruct_metric = compute_metric_by_benchmark(data, factors, benchmarks, INSTRUCT_MODELS)

    # Write output table
    output_path = args.output or os.path.join(results_dir, "baseline_metrics_by_benchmark.csv")

    with open(output_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["model_type", "metric"])
        writer.writerow(["base", base_metric])
        writer.writerow(["instruct", instruct_metric])

    print(f"Written: {output_path}")
SCRIPT_DIR = os.path.dirname(os.path.abspath(__file__))
FACTORS_PATH = os.path.join(SCRIPT_DIR, "factors.json")

# Expected models in each complete CSV
EXPECTED_MODELS = {
    "Qwen3-1.7B-Base",
    "Qwen3-4B-Base",
    "SmolLM3-3B-Base",
    "gemma-3-4b-pt",
}


def get_results_dir():
    """Return the results directory (env override, else ``results``)."""
    return os.environ.get("POST_TRAIN_BENCH_RESULTS_DIR", "results")


def load_factors(factors_path: str) -> dict:
    """Load factors from JSON file."""
    with open(factors_path, "r") as f:
        return json.load(f)


def load_final_csv(csv_path: str, valid_benchmarks: set) -> tuple[dict, list]:
    """
    Load a final CSV file into a dict: {model: {benchmark: value}}.

    Only benchmarks that are in *valid_benchmarks* are loaded, and blank cells
    are left out of the per-model dict.
    Returns (data_dict, list_of_benchmarks).
    """
    data = {}
    with open(csv_path, "r", newline="") as f:
        reader = csv.reader(f)
        header = next(reader)
        all_benchmarks = header[1:]
        benchmarks = [b for b in all_benchmarks if b in valid_benchmarks]

        for row in reader:
            values = data[row[0]] = {}
            for i, bench in enumerate(all_benchmarks):
                if bench not in valid_benchmarks:
                    continue
                val = row[i + 1]
                if val == "":
                    continue
                values[bench] = float(val)

    return data, benchmarks


def compute_metric(data: dict, factors: dict, benchmarks: list) -> float:
    """
    Compute weighted sum where each benchmark value is averaged across models.

    Raises:
        KeyError: If a model has no value for one of *benchmarks* (for
            example because the cell was blank in the CSV); the message names
            the benchmark and the affected models.
    """
    total = 0.0
    num_models = len(data)
    for bench in benchmarks:
        missing = [model for model in data if bench not in data[model]]
        if missing:
            raise KeyError(
                f"no value for benchmark {bench!r} in model(s) {missing}"
            )
        avg_value = sum(data[model][bench] for model in data) / num_models
        total += avg_value * factors[bench]
    return total


def main():
    """Compute one metric per complete ``final_*.csv`` and write them to a CSV."""
    parser = argparse.ArgumentParser(
        description="Compute final metrics from final_*.csv files using benchmark factors."
    )
    parser.add_argument(
        "--output",
        default=None,
        help="Output CSV path. Defaults to final_metrics.csv in results dir.",
    )
    args = parser.parse_args()

    results_dir = get_results_dir()
    factors = load_factors(FACTORS_PATH)
    valid_benchmarks = set(factors.keys())

    # Each candidate file is parsed once: the same load serves both the
    # completeness check and the metric computation.
    results = {}  # {method_name: metric}
    for filename in sorted(os.listdir(results_dir)):
        if not (filename.startswith("final_") and filename.endswith(".csv")):
            continue
        if filename.startswith("final_time_"):
            continue

        csv_path = os.path.join(results_dir, filename)
        try:
            data, benchmarks = load_final_csv(csv_path, valid_benchmarks)
        except Exception:
            print(f"Warning: could not load {csv_path}.")
            raise

        # Only tables covering exactly the expected models are scored
        if set(data.keys()) != EXPECTED_MODELS:
            continue

        method_name = filename[len("final_"):-len(".csv")]
        results[method_name] = compute_metric(data, factors, benchmarks)

    # Write output table
    output_path = args.output or os.path.join(results_dir, "final_metrics.csv")

    with open(output_path, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(["method", "metric"])
        for method_name in sorted(results):
            writer.writerow([method_name, results[method_name]])

    print(f"Written: {output_path}")
def get_results_dir():
    """
    Return the directory that holds the result CSVs.

    Uses the POST_TRAIN_BENCH_RESULTS_DIR environment variable when it is
    set, and falls back to the relative path "results" otherwise.
    """
    try:
        return os.environ["POST_TRAIN_BENCH_RESULTS_DIR"]
    except KeyError:
        return "results"
def compute_metric(data: dict, factors: dict, benchmarks: list) -> float:
    """
    Return the factor-weighted sum of per-benchmark averages.

    For each benchmark in benchmarks, the values of all models in data are
    averaged (dividing by the number of models) and multiplied by
    factors[benchmark]; the products are summed.

    Raises KeyError if a model has no value for a benchmark or a benchmark
    has no factor.
    """
    model_count = len(data)
    weighted_sum = 0.0
    for name in benchmarks:
        column = [scores[name] for scores in data.values()]
        weighted_sum += (sum(column) / model_count) * factors[name]
    return weighted_sum
def main():
    """
    Write per-method metrics and per-agent aggregates into the results dir.

    Produces single_metrics.csv (one row per method, sorted by name) and
    single_metrics_aggregated.csv (avg, std and run count for every agent
    group in HARDCODED_AGENT_MAP, sorted by agent name).
    """
    out_dir = get_results_dir()
    all_metrics = compute_all_metrics(out_dir, load_factors(FACTORS_PATH))

    # Per-method table.
    metrics_path = os.path.join(out_dir, "single_metrics.csv")
    with open(metrics_path, "w", newline="") as handle:
        table = csv.writer(handle)
        table.writerow(["method", "metric"])
        table.writerows([name, all_metrics[name]] for name in sorted(all_metrics))
    print(f"Written: {metrics_path}")

    # Per-agent statistics over that agent's runs. A run listed in
    # HARDCODED_AGENT_MAP but absent from all_metrics raises KeyError here,
    # after the per-method table has already been written.
    summary = {}
    for agent, runs in HARDCODED_AGENT_MAP.items():
        scores = [all_metrics[run] for run in runs]
        summary[agent] = {
            "avg": mean(scores),
            "std": stddev(scores),
            "n": len(scores),
        }

    # Per-agent table.
    aggregated_path = os.path.join(out_dir, "single_metrics_aggregated.csv")
    with open(aggregated_path, "w", newline="") as handle:
        table = csv.writer(handle)
        table.writerow(["agent", "avg", "std", "n"])
        for agent in sorted(summary):
            stats = summary[agent]
            table.writerow([agent, stats["avg"], stats["std"], stats["n"]])
    print(f"Written: {aggregated_path}")
a/scripts/constants.py b/scripts/constants.py index c1dbd615..c45bcefa 100644 --- a/scripts/constants.py +++ b/scripts/constants.py @@ -62,7 +62,13 @@ "claude_non_api_claude-opus-4-6_1m__10h_run1", "claude_non_api_claude-opus-4-6_1m__10h_run2", "claude_non_api_claude-opus-4-6_1m__10h_run3" - ] + ], + + "Opus-4.7":[ + "claude_non_api_claude-opus-4-7_10h", + "claude_non_api_claude-opus-4-7_10h_run2", + "claude_non_api_claude-opus-4-7_10h_run3" + ] } diff --git a/scripts/rerun_eval_n_times.sh b/scripts/rerun_eval_n_times.sh new file mode 100755 index 00000000..c3b89263 --- /dev/null +++ b/scripts/rerun_eval_n_times.sh @@ -0,0 +1,152 @@ +#!/bin/bash +# Re-run the per-task evaluate.py N times on an already-finished EVAL_DIR +# and aggregate per-run metrics into /metrics_averaged.json. +# +# Usage: +# scripts/rerun_eval_n_times.sh [N] +# +# Defaults: N=5. +# +# Mirrors run_task.sh's evaluation step: runs src/eval/tasks//evaluate.py +# (NOT the snapshot in $EVAL_DIR/task) under the same vllm_debug container with +# the same fuse-overlayfs HF cache and the same max-tokens fallback ladder. +# +# Run from the repo root, on a node with GPUs (submit via +# src/commit_utils/rerun_eval.sub for cluster execution). +set -euo pipefail + +if [ "$#" -lt 1 ]; then + echo "usage: $0 [N]" >&2 + exit 1 +fi + +EVAL_DIR="$(realpath "$1")" +N="${2:-5}" + +if [ ! -d "$EVAL_DIR/final_model" ]; then + echo "ERROR: $EVAL_DIR/final_model not found" >&2 + exit 1 +fi + +source src/commit_utils/set_env_vars.sh + +# Derive the task name from the EVAL_DIR basename: __. +EVAL_BASENAME="$(basename "$EVAL_DIR")" +EVALUATION_TASK="${EVAL_BASENAME%%_*}" + +if [ ! 
-f "src/eval/tasks/${EVALUATION_TASK}/evaluate.py" ]; then + echo "ERROR: src/eval/tasks/${EVALUATION_TASK}/evaluate.py not found" >&2 + echo " (parsed task '${EVALUATION_TASK}' from $(basename "$EVAL_DIR"))" >&2 + exit 1 +fi + +REPO_ROOT="$(pwd)" +RERUNS_DIR="$EVAL_DIR/reruns" +mkdir -p "$RERUNS_DIR" + +# Per-task max-tokens fallback ladder, mirroring run_task.sh. +case "$EVALUATION_TASK" in + aime2025) FB1="--max-tokens 12000"; FB2="--max-tokens 8000" ;; + arenahardwriting) FB1="--max-new-tokens 12288"; FB2="--max-new-tokens 8192" ;; + bfcl) FB1="--max-tokens 12000"; FB2="--max-tokens 8000" ;; + gpqamain) FB1="--max-tokens 12000"; FB2="--max-tokens 8000" ;; + gsm8k) FB1="--max-tokens 3000"; FB2="--max-tokens 2000" ;; + healthbench) FB1="--max-new-tokens 12288"; FB2="--max-new-tokens 8192" ;; + humaneval) FB1="--max-tokens 3000"; FB2="--max-tokens 2000" ;; + *) FB1=""; FB2="" ;; +esac + +# Fuse-overlayfs HF cache so reruns don't pollute the shared HF cache, +# matching run_task.sh's with_huggingface_overlay helper. 
run_with_fallback() {
    # Run the evaluation once per fallback level (default, then $FB1, then
    # $FB2) until the JSON output file exists.
    #   $1  path of the JSON file the evaluation should write
    #   $2  prefix for the per-level log files (<prefix>_<level>.log)
    # Returns 0 as soon as the output file is present, 1 if no level produced it.
    local out_json="$1"
    local log_prefix="$2"

    # Drop any stale result so the existence check below reflects this call only.
    rm -f "$out_json"

    for level in default fb1 fb2; do
        local extra=""
        if [ "$level" = "fb1" ]; then
            extra="$FB1"
        elif [ "$level" = "fb2" ]; then
            extra="$FB2"
        fi
        echo " [$level] extra='${extra}'"
        # A failing attempt must not abort the script (set -e); success is
        # judged by the output file alone.
        run_one "$out_json" "$extra" "${log_prefix}_${level}.log" || true
        if [ -f "$out_json" ]; then
            return 0
        fi
    done
    return 1
}
"codexhigh_gpt-5.1-codex-max_10h_v7_seed1", + ], + "Opus-4.6": [ + "claude_claude-opus-4-6_10h_run1_old_container", + "claude_claude-opus-4-6_10h_run2", + "claude_claude-opus-4-6_10h_run3", + ], + "GPT-5.3-Codex_Med": [ + "codex_non_api_gpt-5.3-codex_10h_run1", + "codex_non_api_gpt-5.3-codex_10h_run2", + "codex_non_api_gpt-5.3-codex_10h_run3", + ], + "Gemini-3.1-Pro": [ + "opencode_opencode_gemini-3.1-pro_10h_run1", + "opencode_opencode_gemini-3.1-pro_10h_run2", + "opencode_opencode_gemini-3.1-pro_10h_run3", + ], + "GPT-5.3-Codex_High": [ + "codex_non_api_high_gpt-5.3-codex_10h_run1", + "codex_non_api_high_gpt-5.3-codex_10h_run2", + "codex_non_api_high_gpt-5.3-codex_10h_run3", + ], + "GPT-5.4-High": [ + "codex_non_api_high_gpt-5.4_10h_run1", + "codex_non_api_high_gpt-5.4_10h_run2", + "codex_non_api_high_gpt-5.4_10h_run3", + ], + "Opus-4.6-1M": [ + "claude_non_api_claude-opus-4-6_1m__10h_run1", + "claude_non_api_claude-opus-4-6_1m__10h_run2", + "claude_non_api_claude-opus-4-6_1m__10h_run3", + ], + "Opus-4.7":[ + "claude_non_api_claude-opus-4-7_10h", + "claude_non_api_claude-opus-4-7_10h_run2", + "claude_non_api_claude-opus-4-7_10h_run3", + ], + "GPT-5.5-xHigh":[ + "codex_non_api_xhigh_gpt-5.5_10h_run1", + "codex_non_api_xhigh_gpt-5.5_10h_run2", + + ] +} + +HARDCODED_BENCHMARKS = [ + "aime2025", + "arenahardwriting", + "bfcl", + "gpqamain", + "gsm8k", + "healthbench", + "humaneval", +] + +EXPECTED_MODELS = { + "Qwen3-1.7B-Base", + "Qwen3-4B-Base", + "SmolLM3-3B-Base", + "gemma-3-4b-pt", +} + +BUDGET_SECONDS = 10 * 3600 # 10 hours + + +def load_factors() -> dict: + with open(FACTORS_PATH, "r") as f: + return json.load(f) + + +def load_baselines() -> dict: + """Load hardcoded baseline data from baselines.json. + + Returns {"zeroshot": {model: {bench: value}}, "fewshot": {...}}. + Values are floats. 
def stddev(values: list[float]) -> float:
    """
    Return the sample standard deviation (n - 1 denominator) of values.

    A single value has no spread to estimate, so 0.0 is returned for it
    rather than dividing by zero. An empty list raises ZeroDivisionError.
    """
    count = len(values)
    avg = sum(values) / count  # same arithmetic as mean(); fails on empty input
    if count < 2:
        return 0.0
    variance = sum((x - avg) ** 2 for x in values) / (count - 1)
    return math.sqrt(variance)
def write_csv(
    path: str,
    models: list[str],
    benchmarks: list[str],
    data: dict[str, dict[str, str]],
):
    """
    Write a model-by-benchmark table to path as CSV.

    The header row is "model" followed by benchmarks. Each following row is
    one entry of models with its value for every benchmark; a benchmark
    missing from data[model] is written as an empty cell.
    """
    with open(path, "w", newline="") as out:
        sheet = csv.writer(out)
        sheet.writerow(["model"] + benchmarks)
        for name in models:
            cells = [data[name].get(column, "") for column in benchmarks]
            sheet.writerow([name] + cells)
+ """ + latest_runs = {} + + for entry in os.listdir(method_path): + entry_path = os.path.join(method_path, entry) + if not os.path.isdir(entry_path): + continue + + try: + benchmark, _, model, run_id_str = entry.split("_") + run_id = int(run_id_str) + except ValueError: + print(entry) + raise ValueError(f"{entry}, {method_path}") + + if max_run_id is not None and run_id >= max_run_id: + continue + if min_run_id is not None and run_id < min_run_id: + continue + + key = (benchmark, model) + if key not in latest_runs or run_id > latest_runs[key]["run_id"]: + latest_runs[key] = {"run_id": run_id, "path": entry_path} + + return latest_runs + + +# --------------------------------------------------------------------------- +# Metrics loading +# --------------------------------------------------------------------------- + +def load_metrics(metrics_path: str, method_name: str = None) -> str: + """ + Return the accuracy as a string, or an error label. + + Error labels for non-baseline methods: + - "not avl." if time_taken.txt doesn't exist + - "not stored" if time_taken.txt exists but final_model/ doesn't + - "ERR" otherwise + For baseline: always "ERR" on failure. + """ + if os.path.exists(metrics_path): + try: + with open(metrics_path, "r") as f: + data = json.load(f) + acc = data.get("accuracy") + if acc is not None: + return str(acc) + except Exception: + pass + + if method_name == "baseline_zeroshot": + return "ERR" + + run_dir = os.path.dirname(metrics_path) + + if not os.path.exists(os.path.join(run_dir, "time_taken.txt")): + return "not avl." 
+ + if not os.path.isdir(os.path.join(run_dir, "final_model")): + return "not stored" + + return "ERR" + + +# --------------------------------------------------------------------------- +# Contamination loading +# --------------------------------------------------------------------------- + +def load_contamination(contamination_path: str): + """Return True, False, "IMPORTANT ERR", or "ERR".""" + if not os.path.exists(contamination_path): + return "ERR" + try: + with open(contamination_path, "r") as f: + content = f.read().strip() + except Exception: + return "ERR" + if content == "contamination detected": + return True + elif content == "no contamination detected": + return False + else: + return "IMPORTANT ERR" + + +def load_disallowed_model(disallowed_path: str): + """Return True, False, "IMPORTANT ERR", or "ERR".""" + if not os.path.exists(disallowed_path): + return "ERR" + try: + with open(disallowed_path, "r") as f: + content = f.read().strip() + except Exception: + return "ERR" + if content == "disallowed use detected": + return True + elif content == "only allowed use detected": + return False + else: + return "IMPORTANT ERR" + + +def combine_contamination_results(contamination, disallowed_model) -> str: + """ + Combine contamination and disallowed model results into a cell value. + + Returns "" (clean), "C", "M", "MC", or an error string. 
+ """ + if contamination in ("ERR", "IMPORTANT ERR") or disallowed_model in ( + "ERR", + "IMPORTANT ERR", + ): + errors = [] + if contamination in ("ERR", "IMPORTANT ERR"): + errors.append(f"C:{contamination}") + if disallowed_model in ("ERR", "IMPORTANT ERR"): + errors.append(f"M:{disallowed_model}") + return " ".join(errors) + + if disallowed_model and contamination: + return "MC" + elif disallowed_model and not contamination: + return "M" + elif not disallowed_model and contamination: + return "C" + else: + return "" + + +# --------------------------------------------------------------------------- +# Time loading +# --------------------------------------------------------------------------- + +def parse_time_hms(time_str: str) -> int | None: + """Parse H:M:S string to total seconds. Returns None on failure.""" + match = re.match(r"^(\d+):(\d{1,2}):(\d{1,2})$", time_str.strip()) + if not match: + return None + hours, minutes, seconds = map(int, match.groups()) + if minutes >= 60 or seconds >= 60: + return None + return hours * 3600 + minutes * 60 + seconds + + +def format_time_hms(total_seconds: int) -> str: + """Convert total seconds to H:MM:SS format.""" + hours = total_seconds // 3600 + minutes = (total_seconds % 3600) // 60 + seconds = total_seconds % 60 + return f"{hours}:{minutes:02d}:{seconds:02d}" + + +def load_time_taken(run_dir: str) -> tuple[str, int | None]: + """ + Return (display_string, total_seconds). + Returns ("ERR", None) on failure. 
def compare_csvs(gt_path: str, new_path: str) -> list[str]:
    """
    Compare the ground-truth CSV at gt_path with the CSV at new_path.

    Cells are compared as strings; two differing cells still count as equal
    when both parse as numbers and differ by less than FLOAT_TOLERANCE.
    If the row or column counts differ, that is reported and only the
    overlapping rows/columns are compared.

    Returns a list of mismatch descriptions; an empty list means the files
    match.
    """
    mismatches = []

    expected = load_csv(gt_path)
    actual = load_csv(new_path)

    if len(expected) != len(actual):
        mismatches.append(f"Row count differs: {len(expected)} vs {len(actual)}")

    # First ground-truth row doubles as the column labels in messages.
    header = expected[0] if expected else []

    # zip() stops at the shorter input, i.e. the overlapping rows/columns.
    for row_idx, (exp_row, act_row) in enumerate(zip(expected, actual)):
        if len(exp_row) != len(act_row):
            mismatches.append(
                f" Row {row_idx}: column count differs: "
                f"{len(exp_row)} vs {len(act_row)}"
            )

        for col_idx, (exp_cell, act_cell) in enumerate(zip(exp_row, act_row)):
            if exp_cell == act_cell:
                continue

            both_numeric = is_number(exp_cell) and is_number(act_cell)
            if both_numeric and abs(float(exp_cell) - float(act_cell)) < FLOAT_TOLERANCE:
                continue

            # Data rows get a "(row name, column name)" hint; the header row
            # itself does not.
            where = ""
            if row_idx > 0 and header:
                col_name = header[col_idx] if col_idx < len(header) else "?"
                row_name = exp_row[0] if exp_row else "?"
                where = f" ({row_name}, {col_name})"

            mismatches.append(
                f" Row {row_idx}, Col {col_idx}{where}: "
                f"'{exp_cell}' vs '{act_cell}'"
            )

    return mismatches
def should_verify(filename: str) -> bool:
    """
    Decide whether the CSV called filename takes part in verification.

    Verified: per-method final_*.csv and contamination_*.csv files, the
    single-metric outputs, per-agent aggregated_avg_*/aggregated_std_* files
    and time_aggregated.csv. Skipped: the over-models aggregates, baseline
    files, aggregated_time_* files, final_avg_/final_std_/final_time_ files
    and anything not listed above.
    """
    skipped_exact = {
        "aggregated_avg_over_models.csv",
        "aggregated_std_over_models.csv",
        "final_baseline.csv",
        "final_baseline_zeroshot.csv",
        "contamination_baseline.csv",
        "contamination_baseline_zeroshot.csv",
    }
    if filename in skipped_exact or filename.startswith("aggregated_time_"):
        return False

    if filename.endswith(".csv"):
        if filename.startswith("final_"):
            # Per-method score grids, minus the derived avg/std/time variants.
            return not filename.startswith(("final_avg_", "final_std_", "final_time_"))
        if filename.startswith("contamination_"):
            return True

    verified_exact = (
        "single_metrics.csv",
        "single_metrics_aggregated.csv",
        "time_aggregated.csv",
    )
    return filename in verified_exact or filename.startswith(
        ("aggregated_avg_", "aggregated_std_")
    )
+ ) + parser.add_argument( + "--ground-truth", + required=True, + help="Directory with ground truth CSV outputs (from original scripts).", + ) + parser.add_argument( + "--new-output", + required=True, + help="Directory with new CSV outputs (from refactored scripts).", + ) + args = parser.parse_args() + + gt_dir = args.ground_truth + new_dir = args.new_output + + if not os.path.isdir(gt_dir): + print(f"Error: ground truth dir not found: {gt_dir}") + sys.exit(1) + if not os.path.isdir(new_dir): + print(f"Error: new output dir not found: {new_dir}") + sys.exit(1) + + matches = find_matching_files(gt_dir, new_dir) + + if not matches: + print("No matching CSV files found to compare.") + sys.exit(1) + + # Check for files in ground truth that are missing from new output + gt_verifiable = set( + f for f in os.listdir(gt_dir) if f.endswith(".csv") and should_verify(f) + ) + new_verifiable = set( + f for f in os.listdir(new_dir) if f.endswith(".csv") and should_verify(f) + ) + + missing_from_new = gt_verifiable - new_verifiable + extra_in_new = new_verifiable - gt_verifiable + + total_files = len(matches) + passed = 0 + failed = 0 + failure_details = [] + + print(f"Comparing {total_files} CSV files...\n") + + for filename, (gt_path, new_path) in sorted(matches.items()): + errors = compare_csvs(gt_path, new_path) + if errors: + failed += 1 + failure_details.append((filename, errors)) + print(f" FAIL {filename}") + else: + passed += 1 + print(f" PASS {filename}") + + # Summary + print(f"\n{'='*60}") + print(f"Results: {passed} passed, {failed} failed, {total_files} total") + + if missing_from_new: + print(f"\nMISSING from new output ({len(missing_from_new)}):") + for f in sorted(missing_from_new): + print(f" - {f}") + + if extra_in_new: + print(f"\nEXTRA in new output ({len(extra_in_new)}):") + for f in sorted(extra_in_new): + print(f" + {f}") + + if failure_details: + print(f"\nFailure details:") + for filename, errors in failure_details: + print(f"\n {filename}:") + for err 
in errors[:10]: # Cap at 10 errors per file + print(f" {err}") + if len(errors) > 10: + print(f" ... and {len(errors) - 10} more") + + if failed or missing_from_new: + sys.exit(1) + else: + print("\nAll checks passed.") + sys.exit(0) + + +if __name__ == "__main__": + main() diff --git a/src/commit_utils/commit.sh b/src/commit_utils/commit.sh index 34abd374..3c43144e 100644 --- a/src/commit_utils/commit.sh +++ b/src/commit_utils/commit.sh @@ -2,71 +2,83 @@ source src/commit_utils/set_env_vars.sh models=( - "google/gemma-3-4b-pt" + # "google/gemma-3-4b-pt" "Qwen/Qwen3-4B-Base" - "Qwen/Qwen3-1.7B-Base" - "HuggingFaceTB/SmolLM3-3B-Base" + # "Qwen/Qwen3-1.7B-Base" + # "HuggingFaceTB/SmolLM3-3B-Base" ) evals=( - "aime2025" - "arenahardwriting" - "bfcl" - "gpqamain" - "gsm8k" - "humaneval" + # "aime2025" + # "arenahardwriting" + # "bfcl" + # "gpqamain" + # "gsm8k" + # "humaneval" "healthbench" ) -# export POST_TRAIN_BENCH_EXPERIMENT_NAME="_pushed" +export POST_TRAIN_BENCH_EXPERIMENT_NAME="_METR" for model in "${models[@]}"; do for eval in "${evals[@]}"; do echo "" echo $model on $eval if [ "${POST_TRAIN_BENCH_JOB_SCHEDULER}" = "htcondor_mpi-is" ]; then # Proprietary (API) - condor_submit_bid 100 -a "agent=codex" -a "agent_config=gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=codex" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" 
src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=qwen3max" -a "agent_config=qwen3-max-2026-01-23" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=codex" -a "agent_config=gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=codex" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude" -a "agent_config=claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=qwen3max" -a "agent_config=qwen3-max-2026-01-23" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub # Proprietary (Subscription plan) - condor_submit_bid 100 -a "agent=codex_non_api" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 150 -a "agent=claude_non_api" -a "agent_config=claude-sonnet-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - 
condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.2" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=50" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=codex_non_api" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 150 -a "agent=claude_non_api" -a "agent_config=claude-sonnet-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.2" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=50" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-7" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # 
condor_submit_bid 100 -a "agent=codex_non_api_xhigh" -a "agent_config=gpt-5.5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=codex_xhigh" -a "agent_config=gpt-5.5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # Multi-GPU runs might need more than 8 CPUs and 128 GB of RAM (use 512 GB to be safe) - condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=50" -a "request_memory=524288" -a "request_cpus=128" src/commit_utils/single_task.sub - condor_submit_bid 500 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=50" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=100" -a "request_memory=524288" -a "request_cpus=128" -a "request_disk=800G" src/commit_utils/single_task.sub + condor_submit_bid 100 -a "agent=claude_reprompt" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=100" -a "request_memory=524288" -a "request_cpus=128" -a "request_disk=800G" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=50" -a "request_memory=524288" -a "request_cpus=128" -a "request_disk=800G" src/commit_utils/single_task.sub + # condor_submit_bid 500 -a "agent=claude_non_api" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=8" -a "num_hours=50" src/commit_utils/single_task.sub - # Reprompted variant to push the agent (such as GPT 5.4) - condor_submit_bid 100 -a 
"agent=codex_non_api_high_reprompt" -a "agent_config=gpt-5.4" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # # Reprompted variant to push the agent (such as GPT 5.4) + # condor_submit_bid 50 -a "agent=codex_non_api_high_reprompt" -a "agent_config=gpt-5.4" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=codex_non_api_xhigh_reprompt" -a "agent_config=gpt-5.5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=codex_xhigh_reprompt" -a "agent_config=gpt-5.5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude_reprompt" -a "agent_config=claude-opus-4-6[1m]" -a "eval=$eval" -a "model_to_train=$model" -a "num_gpus=1" -a "num_hours=5" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.4" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=codex_non_api_xhigh" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=claude_non_api_max" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=1" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-flash-preview" -a "eval=$eval" -a "model_to_train=$model" -a 
"num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 150 -a "agent=gemini" -a "agent_config=models/gemini-3.1-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + + + # condor_submit_bid 100 -a "agent=codex_non_api_high" -a "agent_config=gpt-5.4" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=codex_non_api_xhigh" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=claude_non_api_max" -a "agent_config=claude-opus-4-6" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=claude" -a "agent_config=claude-sonnet-4-5" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=1" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=gemini" -a "agent_config=models/gemini-3-flash-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 150 -a "agent=gemini" -a "agent_config=models/gemini-3.1-pro-preview" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub + # OpenCode - condor_submit_bid 50 -a "agent=opencode" -a "agent_config=anthropic/claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/kimi-k2-thinking" -a "eval=$eval" -a 
"model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-4.7-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 500 -a "agent=opencode" -a "agent_config=opencode/gemini-3-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.1-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=glm5" -a "agent_config=glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.5-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=opencode" -a "agent_config=zai/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/kimi-k2.5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - condor_submit_bid 150 -a "agent=opencode" -a "agent_config=opencode/gemini-3.1-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub - sleep 10 + # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=anthropic/claude-opus-4-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # 
condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/kimi-k2-thinking" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-4.7-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 500 -a "agent=opencode" -a "agent_config=opencode/gemini-3-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.1-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=glm5" -a "agent_config=glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/minimax-m2.5-free" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=opencode" -a "agent_config=zai/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 100 -a "agent=opencode" -a "agent_config=opencode/kimi-k2.5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 50 -a "agent=opencode" -a "agent_config=opencode/glm-5" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # condor_submit_bid 150 -a "agent=opencode" -a "agent_config=opencode/gemini-3.1-pro" -a "eval=$eval" -a "model_to_train=$model" "num_hours=10" src/commit_utils/single_task.sub + # sleep 10 elif [ "${POST_TRAIN_BENCH_JOB_SCHEDULER}" = "htcondor" ]; then condor_submit_bid -a "agent=codex" -a "agent_config=gpt-5.1-codex-max" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub condor_submit_bid -a 
"agent=codex" -a "agent_config=gpt-5.3-codex" -a "eval=$eval" -a "model_to_train=$model" -a "num_hours=10" src/commit_utils/single_task.sub diff --git a/src/commit_utils/rerun_eval.sub b/src/commit_utils/rerun_eval.sub new file mode 100644 index 00000000..62aa4a17 --- /dev/null +++ b/src/commit_utils/rerun_eval.sub @@ -0,0 +1,15 @@ +executable = /bin/bash +num_gpus = 1 +n = 5 +arguments = scripts/rerun_eval_n_times.sh $(eval_dir) $(n) +environment = "OPENAI_API_KEY=$ENV(OPENAI_API_KEY) HOME=$ENV(HOME) POST_TRAIN_BENCH_RESULTS_DIR=$ENV(POST_TRAIN_BENCH_RESULTS_DIR) POST_TRAIN_BENCH_CONTAINERS_DIR=$ENV(POST_TRAIN_BENCH_CONTAINERS_DIR) POST_TRAIN_BENCH_CONTAINER_NAME=$ENV(POST_TRAIN_BENCH_CONTAINER_NAME) POST_TRAIN_BENCH_JOB_SCHEDULER=$ENV(POST_TRAIN_BENCH_JOB_SCHEDULER) HF_HOME=$ENV(HF_HOME)" +error = rerun_$(Cluster).err +output = rerun_$(Cluster).out +log = rerun_$(Cluster).log +request_memory = 131072 +request_cpus = 16 +request_gpus = $(num_gpus) +requirements = TARGET.CUDADeviceName == "NVIDIA H100 80GB HBM3" && Machine != "i104.internal.cluster.is.localnet" +request_disk=400G ++BypassLXCfs="true" +queue