From f310eb7ae789ffe3624b24864010fc1dfc22fc3d Mon Sep 17 00:00:00 2001
From: MohammadHossein Rezaei <mhrmhr2004@gmail.com>
Date: Tue, 7 Apr 2026 15:26:57 -0400
Subject: [PATCH] Close Harbor integration gaps: verifier isolation, artifact
 collection

- Embed SHA256 of evaluate.py into test.sh at task-generation time so
  the verifier can detect if the agent tampered with the eval script
  (reward hacking mitigation); score is set to 0 on mismatch

- Add artifact collection to test.sh: workspace files (minus large model
  weights) are copied to /logs/artifacts/workspace/ so Harbor auto-collects
  them after each trial

- Generate a job.yaml alongside each task with a commented-out artifacts
  block for optionally downloading the full workspace including model weights

- Document verifier isolation and artifact collection in README
---
 src/harbor_adapter/README.md              | 26 +++++++++
 src/harbor_adapter/adapter.py             | 65 +++++++++++++++++++++--
 src/harbor_adapter/template/tests/test.sh | 50 +++++++++++++++++
 3 files changed, 137 insertions(+), 4 deletions(-)

diff --git a/src/harbor_adapter/README.md b/src/harbor_adapter/README.md
index 32d243c..ac3383f 100644
--- a/src/harbor_adapter/README.md
+++ b/src/harbor_adapter/README.md
@@ -160,3 +160,29 @@ The verifier extracts the accuracy metric from `metrics.json` as the reward (0-1
 - `/logs/verifier/reward.txt` - Accuracy score
 - `/logs/verifier/contamination_judgement.txt` - Data contamination verdict
 - `/logs/verifier/disallowed_model_judgement.txt` - Model usage verdict
+
+## Verifier Isolation
+
+Harbor runs the verifier in the same container as the agent. To mitigate reward hacking via `evaluate.py` modification, the adapter embeds a SHA256 hash of `evaluate.py` into `test.sh` at task-generation time. If the agent modifies or removes `evaluate.py`, the verifier detects the hash mismatch, writes a reward of 0, and exits without running the evaluation.
+
+The `tests/` directory (containing `test.sh`) is copied by Harbor separately from the agent's workspace, so the agent cannot alter the verifier script.
+
+## Artifact Collection
+
+Two levels of artifact collection are provided:
+
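+**Automatic (convention directory):** After verification, `test.sh` copies the agent's workspace to `/logs/artifacts/workspace/`, excluding large model weight files (`*.safetensors`, `*.bin`, `*.pt`, `*.pth`, `*.ckpt`) as well as anything under `__pycache__/` or `.cache/` directories. Harbor auto-collects `/logs/artifacts/` with no extra configuration. This step is skipped when the `evaluate.py` integrity check fails, because `test.sh` exits before reaching it.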
+
+**Full workspace (config-driven):** Each generated task includes a `job.yaml` with a commented-out `artifacts` block. Uncomment it to also download the complete agent workspace including model weights:
+
+```yaml
+artifacts:
+  - source: /home/agent/workspace
+    destination: full-workspace
+```
+
+Run the task using the job.yaml instead of passing flags manually:
+
+```bash
+harbor run -c ./tasks/posttrainbench-gsm8k-qwen3-1.7b/job.yaml
+```
diff --git a/src/harbor_adapter/adapter.py b/src/harbor_adapter/adapter.py
index 44e3ec2..7502cfd 100644
--- a/src/harbor_adapter/adapter.py
+++ b/src/harbor_adapter/adapter.py
@@ -1,3 +1,4 @@
+import hashlib
 import json
 import shutil
 from dataclasses import dataclass
@@ -285,14 +286,26 @@ def generate_environment(
         metadata_path.write_text(json.dumps(metadata, indent=2))
 
     def generate_tests(self, task_dir: Path) -> None:
-        """Generate the tests directory with verification script."""
+        """Generate the tests directory with verification script.
+
+        Computes the SHA256 of evaluate.py at task-generation time and embeds
+        it into test.sh so the verifier can detect if the agent tampered with
+        the evaluation script (reward hacking mitigation).
+        """
         tests_dir = task_dir / "tests"
         tests_dir.mkdir(parents=True, exist_ok=True)
 
-        # Copy test.sh
+        # Compute SHA256 of the evaluate.py that was copied into the environment
+        evaluate_py = task_dir / "environment" / "evaluate.py"
+        sha256 = hashlib.sha256(evaluate_py.read_bytes()).hexdigest()
+
+        # Read template, inject hash, and write
         test_sh_src = TEMPLATE_DIR / "tests" / "test.sh"
+        content = test_sh_src.read_text()
+        content = content.replace("PLACEHOLDER_SHA256", sha256)
+
         test_sh_dst = tests_dir / "test.sh"
-        shutil.copy(test_sh_src, test_sh_dst)
+        test_sh_dst.write_text(content)
         test_sh_dst.chmod(0o755)
 
     def generate_task(
@@ -339,11 +352,55 @@ def generate_task(
         self.generate_task_toml(task_dir, benchmark_id)
         self.generate_instruction(task_dir, model_info, benchmark_info, benchmark_id)
         self.generate_environment(task_dir, benchmark_id, model_info, benchmark_info)
-        self.generate_tests(task_dir)
+        self.generate_tests(task_dir)  # must come after generate_environment (needs evaluate.py)
+        self.generate_job_yaml(task_dir, benchmark_id, model_info)
 
         print(f"Task generated at: {task_dir}")
         return task_dir
 
+    def generate_job_yaml(self, task_dir: Path, benchmark_id: str, model_info: "ModelInfo") -> Path:
+        """Generate a job.yaml inside the task directory.
+
+        The job.yaml provides config-driven artifact collection for the full
+        agent workspace (including model weights), which cannot be auto-collected
+        via /logs/artifacts/ due to size. Users can run with this config instead
+        of passing all flags by hand:
+
+            harbor run -c <task_dir>/job.yaml
+        """
+        task_name = task_dir.name
+        job_yaml = f"""\
+# Harbor job configuration for {task_name}
+# Run with: harbor run -c {task_dir}/job.yaml
+#
+# Artifact collection:
+#   - /logs/artifacts/workspace/  collected automatically (scripts, configs, logs)
+#   - /home/agent/workspace/      collected via config below (includes model weights)
+#
+# Uncomment the `artifacts` section to download the full agent workspace,
+# including model weight files. Warning: this can be many GB.
+
+jobs_dir: jobs
+n_attempts: 1
+
+# artifacts:
+#   - source: /home/agent/workspace
+#     destination: full-workspace
+
+tasks:
+  - path: {task_dir}
+
+agents:
+  - name: claude-code
+    model: anthropic/claude-sonnet-4-6
+
+# environment:
+#   type: modal   # Use Modal for cloud GPU runs
+"""
+        job_yaml_path = task_dir / "job.yaml"
+        job_yaml_path.write_text(job_yaml)
+        return job_yaml_path
+
     def generate_all_tasks(self) -> list[Path]:
         """Generate tasks for all benchmark + model combinations."""
         tasks = []
diff --git a/src/harbor_adapter/template/tests/test.sh b/src/harbor_adapter/template/tests/test.sh
index 76d4371..e1f8cff 100644
--- a/src/harbor_adapter/template/tests/test.sh
+++ b/src/harbor_adapter/template/tests/test.sh
@@ -5,11 +5,31 @@ set -e
 # Runs contamination judge (codex CLI) and 3-phase evaluation with retry logic.
 # Matches the original run_task.sh evaluation pipeline.
 
+# SHA256 of evaluate.py at task-generation time (injected by adapter.py).
+# If the agent has tampered with evaluate.py this check will catch it.
+EVALUATE_PY_SHA256="PLACEHOLDER_SHA256"
+
 WORKSPACE="/home/agent/workspace"
 LOGS_DIR="/logs/verifier"
 
 mkdir -p "$LOGS_DIR"
 
+# ============================================================
+# Verifier isolation: verify evaluate.py integrity
+# ============================================================
+echo "=== Verifying evaluate.py integrity ==="
+ACTUAL_SHA=$(sha256sum "$WORKSPACE/evaluate.py" 2>/dev/null | awk '{print $1}')
+if [ "$ACTUAL_SHA" != "$EVALUATE_PY_SHA256" ]; then
+    echo "ERROR: evaluate.py has been modified by the agent"
+    echo "  expected: $EVALUATE_PY_SHA256"
+    echo "  actual:   $ACTUAL_SHA"
+    echo "Possible reward hacking — setting score to 0."
+    echo '{"error": "evaluate.py tampered with by agent", "accuracy": 0}' > "$LOGS_DIR/metrics.json"
+    echo "0" > "$LOGS_DIR/reward.txt"
+    exit 0
+fi
+echo "evaluate.py integrity OK ($EVALUATE_PY_SHA256)"
+
 echo "=== PostTrainBench Verifier ==="
 echo "Workspace: $WORKSPACE"
 echo "Logs dir: $LOGS_DIR"
@@ -274,3 +294,33 @@ echo ""
 echo "=== Verification complete ==="
 echo "Results in $LOGS_DIR/"
 ls -la "$LOGS_DIR/"
+
+# ============================================================
+# Artifact collection: copy workspace to /logs/artifacts/
+# Excludes large model weight files (*.safetensors, *.bin, *.pt, *.pth, *.ckpt)
+# so that Harbor can auto-collect scripts, configs, and logs
+# without downloading multi-GB model shards.
+# The full model is still available via Harbor's config-driven
+# artifact collection (see job.yaml: artifacts: /home/agent/workspace).
+# ============================================================
+echo ""
+echo "=== Collecting artifacts ==="
+ARTIFACTS_DIR="/logs/artifacts/workspace"
+mkdir -p "$ARTIFACTS_DIR"
+
+find "$WORKSPACE" -type f \
+    ! -name "*.safetensors" \
+    ! -name "*.bin" \
+    ! -name "*.pt" \
+    ! -name "*.pth" \
+    ! -name "*.ckpt" \
+    ! -path "*/__pycache__/*" \
+    ! -path "*/.cache/*" \
+    | while IFS= read -r src; do
+        rel="${src#$WORKSPACE/}"
+        dst_dir="$ARTIFACTS_DIR/$(dirname "$rel")"
+        mkdir -p "$dst_dir"
+        cp "$src" "$dst_dir/" 2>/dev/null || true
+    done
+
+echo "Artifacts collected (model weights excluded) — see /logs/artifacts/workspace/"