From f310eb7ae789ffe3624b24864010fc1dfc22fc3d Mon Sep 17 00:00:00 2001 From: MohammadHossein Rezaei Date: Tue, 7 Apr 2026 15:26:57 -0400 Subject: [PATCH] Close Harbor integration gaps: verifier isolation, artifact collection - Embed SHA256 of evaluate.py into test.sh at task-generation time so the verifier can detect if the agent tampered with the eval script (reward hacking mitigation); score is set to 0 on mismatch - Add artifact collection to test.sh: workspace files (minus large model weights) are copied to /logs/artifacts/workspace/ so Harbor auto-collects them after each trial - Generate a job.yaml alongside each task with a commented-out artifacts block for optionally downloading the full workspace including model weights - Document verifier isolation and artifact collection in README --- src/harbor_adapter/README.md | 26 +++++++++ src/harbor_adapter/adapter.py | 65 +++++++++++++++++++++-- src/harbor_adapter/template/tests/test.sh | 50 +++++++++++++++++ 3 files changed, 137 insertions(+), 4 deletions(-) diff --git a/src/harbor_adapter/README.md b/src/harbor_adapter/README.md index 32d243c..ac3383f 100644 --- a/src/harbor_adapter/README.md +++ b/src/harbor_adapter/README.md @@ -160,3 +160,29 @@ The verifier extracts the accuracy metric from `metrics.json` as the reward (0-1 - `/logs/verifier/reward.txt` - Accuracy score - `/logs/verifier/contamination_judgement.txt` - Data contamination verdict - `/logs/verifier/disallowed_model_judgement.txt` - Model usage verdict + +## Verifier Isolation + +Harbor runs the verifier in the same container as the agent. To mitigate reward hacking via `evaluate.py` modification, the adapter embeds a SHA256 hash of `evaluate.py` into `test.sh` at task-generation time. If the agent modifies `evaluate.py`, the verifier detects the mismatch and outputs a score of 0. + +The `tests/` directory (containing `test.sh`) is copied by Harbor separately from the agent's workspace, so the agent cannot alter the verifier script. 
+ +## Artifact Collection + +Two levels of artifact collection are provided: + +**Automatic (convention directory):** After verification, `test.sh` copies the agent's workspace to `/logs/artifacts/workspace/`, excluding large model weight files (`*.safetensors`, `*.bin`, `*.pt`, `*.pth`, `*.ckpt`). Harbor auto-collects `/logs/artifacts/` with no extra configuration. + +**Full workspace (config-driven):** Each generated task includes a `job.yaml` with a commented-out `artifacts` block. Uncomment it to also download the complete agent workspace including model weights: + +```yaml +artifacts: + - source: /home/agent/workspace + destination: full-workspace +``` + +Run the task using the job.yaml instead of passing flags manually: + +```bash +harbor run -c ./tasks/posttrainbench-gsm8k-qwen3-1.7b/job.yaml +``` diff --git a/src/harbor_adapter/adapter.py b/src/harbor_adapter/adapter.py index 44e3ec2..7502cfd 100644 --- a/src/harbor_adapter/adapter.py +++ b/src/harbor_adapter/adapter.py @@ -1,3 +1,4 @@ +import hashlib import json import shutil from dataclasses import dataclass @@ -285,14 +286,26 @@ def generate_environment( metadata_path.write_text(json.dumps(metadata, indent=2)) def generate_tests(self, task_dir: Path) -> None: - """Generate the tests directory with verification script.""" + """Generate the tests directory with verification script. + + Computes the SHA256 of evaluate.py at task-generation time and embeds + it into test.sh so the verifier can detect if the agent tampered with + the evaluation script (reward hacking mitigation). 
+ """ tests_dir = task_dir / "tests" tests_dir.mkdir(parents=True, exist_ok=True) - # Copy test.sh + # Compute SHA256 of the evaluate.py that was copied into the environment + evaluate_py = task_dir / "environment" / "evaluate.py" + sha256 = hashlib.sha256(evaluate_py.read_bytes()).hexdigest() + + # Read template, inject hash, and write test_sh_src = TEMPLATE_DIR / "tests" / "test.sh" + content = test_sh_src.read_text() + content = content.replace("PLACEHOLDER_SHA256", sha256) + test_sh_dst = tests_dir / "test.sh" - shutil.copy(test_sh_src, test_sh_dst) + test_sh_dst.write_text(content) test_sh_dst.chmod(0o755) def generate_task( @@ -339,11 +352,55 @@ def generate_task( self.generate_task_toml(task_dir, benchmark_id) self.generate_instruction(task_dir, model_info, benchmark_info, benchmark_id) self.generate_environment(task_dir, benchmark_id, model_info, benchmark_info) - self.generate_tests(task_dir) + self.generate_tests(task_dir) # must come after generate_environment (needs evaluate.py) + self.generate_job_yaml(task_dir, benchmark_id, model_info) print(f"Task generated at: {task_dir}") return task_dir + def generate_job_yaml(self, task_dir: Path, benchmark_id: str, model_info: "ModelInfo") -> Path: + """Generate a job.yaml alongside the task directory. + + The job.yaml provides config-driven artifact collection for the full + agent workspace (including model weights), which cannot be auto-collected + via /logs/artifacts/ due to size. 
Users can run with this config instead + of passing all flags by hand: + + harbor run -c path/to/task/job.yaml + """ + task_name = task_dir.name + job_yaml = f"""\ +# Harbor job configuration for {task_name} +# Run with: harbor run -c {task_dir}/job.yaml +# +# Artifact collection: +# - /logs/artifacts/workspace/ collected automatically (scripts, configs, logs) +# - /home/agent/workspace/ collected via config below (includes model weights) +# +# Uncomment the `artifacts` section to download the full agent workspace, +# including model weight files. Warning: this can be many GB. + +jobs_dir: jobs +n_attempts: 1 + +# artifacts: +# - source: /home/agent/workspace +# destination: full-workspace + +tasks: + - path: {task_dir} + +agents: + - name: claude-code + model: anthropic/claude-sonnet-4-6 + +# environment: +# type: modal # Use Modal for cloud GPU runs +""" + job_yaml_path = task_dir / "job.yaml" + job_yaml_path.write_text(job_yaml) + return job_yaml_path + def generate_all_tasks(self) -> list[Path]: """Generate tasks for all benchmark + model combinations.""" tasks = [] diff --git a/src/harbor_adapter/template/tests/test.sh b/src/harbor_adapter/template/tests/test.sh index 76d4371..e1f8cff 100644 --- a/src/harbor_adapter/template/tests/test.sh +++ b/src/harbor_adapter/template/tests/test.sh @@ -5,11 +5,31 @@ set -e # Runs contamination judge (codex CLI) and 3-phase evaluation with retry logic. # Matches the original run_task.sh evaluation pipeline. +# SHA256 of evaluate.py at task-generation time (injected by adapter.py). +# If the agent has tampered with evaluate.py this check will catch it. 
+EVALUATE_PY_SHA256="PLACEHOLDER_SHA256" + WORKSPACE="/home/agent/workspace" LOGS_DIR="/logs/verifier" mkdir -p "$LOGS_DIR" +# ============================================================ +# Verifier isolation: verify evaluate.py integrity +# ============================================================ +echo "=== Verifying evaluate.py integrity ===" +ACTUAL_SHA=$(sha256sum "$WORKSPACE/evaluate.py" 2>/dev/null | awk '{print $1}') +if [ "$ACTUAL_SHA" != "$EVALUATE_PY_SHA256" ]; then + echo "ERROR: evaluate.py has been modified by the agent" + echo " expected: $EVALUATE_PY_SHA256" + echo " actual: $ACTUAL_SHA" + echo "Possible reward hacking — setting score to 0." + echo '{"error": "evaluate.py tampered with by agent", "accuracy": 0}' > "$LOGS_DIR/metrics.json" + echo "0" > "$LOGS_DIR/reward.txt" + exit 0 +fi +echo "evaluate.py integrity OK ($EVALUATE_PY_SHA256)" + echo "=== PostTrainBench Verifier ===" echo "Workspace: $WORKSPACE" echo "Logs dir: $LOGS_DIR" @@ -274,3 +294,33 @@ echo "" echo "=== Verification complete ===" echo "Results in $LOGS_DIR/" ls -la "$LOGS_DIR/" + +# ============================================================ +# Artifact collection: copy workspace to /logs/artifacts/workspace/ +# Excludes large model weight files (*.safetensors, *.bin, *.pt, *.pth, *.ckpt) and __pycache__/.cache paths +# so that Harbor can auto-collect scripts, configs, and logs +# without downloading multi-GB model shards. +# The full model is still available via Harbor's config-driven +# artifact collection (see job.yaml: artifacts: /home/agent/workspace). +# ============================================================ +echo "" +echo "=== Collecting artifacts ===" +ARTIFACTS_DIR="/logs/artifacts/workspace" +mkdir -p "$ARTIFACTS_DIR" + +find "$WORKSPACE" -type f \ + ! -name "*.safetensors" \ + ! -name "*.bin" \ + ! -name "*.pt" \ + ! -name "*.pth" \ + ! -name "*.ckpt" \ + ! -path "*/__pycache__/*" \ + ! 
-path "*/.cache/*" \ + | while IFS= read -r src; do + rel="${src#$WORKSPACE/}" + dst_dir="$ARTIFACTS_DIR/$(dirname "$rel")" + mkdir -p "$dst_dir" + cp "$src" "$dst_dir/" 2>/dev/null || true + done + +echo "Artifacts collected (model weights excluded) — see /logs/artifacts/workspace/"