Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions src/harbor_adapter/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -160,3 +160,29 @@ The verifier extracts the accuracy metric from `metrics.json` as the reward (0-1
- `/logs/verifier/reward.txt` - Accuracy score
- `/logs/verifier/contamination_judgement.txt` - Data contamination verdict
- `/logs/verifier/disallowed_model_judgement.txt` - Model usage verdict

## Verifier Isolation

Harbor runs the verifier in the same container as the agent. To mitigate reward hacking via `evaluate.py` modification, the adapter embeds a SHA256 hash of `evaluate.py` into `test.sh` at task-generation time. If the agent modifies `evaluate.py`, the verifier detects the mismatch and outputs a score of 0.

The `tests/` directory (containing `test.sh`) is copied by Harbor separately from the agent's workspace, so the agent cannot alter the verifier script.

## Artifact Collection

Two levels of artifact collection are provided:

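**Automatic (convention directory):** After verification, `test.sh` copies the agent's workspace to `/logs/artifacts/workspace/`, excluding large model weight files (`*.safetensors`, `*.bin`, `*.pt`, `*.pth`, `*.ckpt`) and cache directories (`__pycache__/`, `.cache/`). Harbor auto-collects `/logs/artifacts/` with no extra configuration. Note that if the `evaluate.py` integrity check fails, `test.sh` exits before this step, so no workspace artifacts are copied for that run.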

**Full workspace (config-driven):** Each generated task includes a `job.yaml` with a commented-out `artifacts` block. Uncomment it to also download the complete agent workspace including model weights:

```yaml
artifacts:
  - source: /home/agent/workspace
    destination: full-workspace
```

Run the task using the job.yaml instead of passing flags manually:

```bash
harbor run -c ./tasks/posttrainbench-gsm8k-qwen3-1.7b/job.yaml
```
65 changes: 61 additions & 4 deletions src/harbor_adapter/adapter.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import hashlib
import json
import shutil
from dataclasses import dataclass
Expand Down Expand Up @@ -285,14 +286,26 @@ def generate_environment(
metadata_path.write_text(json.dumps(metadata, indent=2))

def generate_tests(self, task_dir: Path) -> None:
    """Generate the tests directory with the verification script.

    Computes the SHA256 of evaluate.py at task-generation time and embeds
    it into test.sh so the verifier can detect if the agent tampered with
    the evaluation script (reward hacking mitigation).

    Args:
        task_dir: Root of the generated task. ``environment/evaluate.py``
            is read from beneath it and ``tests/test.sh`` is written
            beneath it.

    Raises:
        FileNotFoundError: If evaluate.py or the test.sh template cannot
            be read.
        ValueError: If the test.sh template does not contain the hash
            placeholder, since the generated verifier would then carry
            no real hash to compare against.
    """
    placeholder = "PLACEHOLDER_SHA256"

    tests_dir = task_dir / "tests"
    tests_dir.mkdir(parents=True, exist_ok=True)

    # Hash the evaluate.py that was copied into the environment.
    evaluate_py = task_dir / "environment" / "evaluate.py"
    sha256 = hashlib.sha256(evaluate_py.read_bytes()).hexdigest()

    # Read the template and refuse to emit a verifier without a real hash.
    test_sh_src = TEMPLATE_DIR / "tests" / "test.sh"
    content = test_sh_src.read_text()
    if placeholder not in content:
        raise ValueError(
            f"{test_sh_src} does not contain {placeholder!r}; "
            "cannot embed the evaluate.py hash"
        )

    # Write the rendered script directly; copying the raw template first
    # would only be overwritten by this call.
    test_sh_dst = tests_dir / "test.sh"
    test_sh_dst.write_text(content.replace(placeholder, sha256))
    test_sh_dst.chmod(0o755)

def generate_task(
Expand Down Expand Up @@ -339,11 +352,55 @@ def generate_task(
self.generate_task_toml(task_dir, benchmark_id)
self.generate_instruction(task_dir, model_info, benchmark_info, benchmark_id)
self.generate_environment(task_dir, benchmark_id, model_info, benchmark_info)
self.generate_tests(task_dir)
self.generate_tests(task_dir) # must come after generate_environment (needs evaluate.py)
self.generate_job_yaml(task_dir, benchmark_id, model_info)

print(f"Task generated at: {task_dir}")
return task_dir

def generate_job_yaml(self, task_dir: Path, benchmark_id: str, model_info: "ModelInfo") -> Path:
    """Generate a job.yaml inside the task directory.

    The job.yaml provides config-driven artifact collection for the full
    agent workspace (including model weights), which cannot be auto-collected
    via /logs/artifacts/ due to size. Users can run with this config instead
    of passing all flags by hand:

        harbor run -c <task_dir>/job.yaml

    Args:
        task_dir: Task directory; its name is used in the header comment,
            its path is written into the ``tasks`` entry, and the file is
            written to ``task_dir / "job.yaml"``.
        benchmark_id: Currently unused by this method.
        model_info: Currently unused by this method.

    Returns:
        Path of the written job.yaml.
    """
    task_name = task_dir.name
    # The YAML is kept flush-left so its own indentation is exactly what is
    # written. List-item keys are nested under their "- " entry, including
    # in the commented-out blocks, so uncommenting them yields valid YAML.
    job_yaml = f"""\
# Harbor job configuration for {task_name}
# Run with: harbor run -c {task_dir}/job.yaml
#
# Artifact collection:
# - /logs/artifacts/workspace/ collected automatically (scripts, configs, logs)
# - /home/agent/workspace/ collected via config below (includes model weights)
#
# Uncomment the `artifacts` section to download the full agent workspace,
# including model weight files. Warning: this can be many GB.

jobs_dir: jobs
n_attempts: 1

# artifacts:
#   - source: /home/agent/workspace
#     destination: full-workspace

tasks:
  - path: {task_dir}

agents:
  - name: claude-code
    model: anthropic/claude-sonnet-4-6

# environment:
#   type: modal  # Use Modal for cloud GPU runs
"""
    job_yaml_path = task_dir / "job.yaml"
    job_yaml_path.write_text(job_yaml)
    return job_yaml_path

def generate_all_tasks(self) -> list[Path]:
"""Generate tasks for all benchmark + model combinations."""
tasks = []
Expand Down
50 changes: 50 additions & 0 deletions src/harbor_adapter/template/tests/test.sh
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,31 @@ set -e
# Runs contamination judge (codex CLI) and 3-phase evaluation with retry logic.
# Matches the original run_task.sh evaluation pipeline.

# Expected SHA256 of evaluate.py, recorded at task-generation time
# (adapter.py injects the real digest into the assignment below).
# A different digest at verification time means the agent's copy changed.
EVALUATE_PY_SHA256="PLACEHOLDER_SHA256"

WORKSPACE="/home/agent/workspace"
LOGS_DIR="/logs/verifier"

mkdir -p "$LOGS_DIR"

# ============================================================
# Verifier isolation: verify evaluate.py integrity
# ============================================================
echo "=== Verifying evaluate.py integrity ==="
# sha256sum errors are discarded; if it prints nothing (for example when
# the file is absent) ACTUAL_SHA is empty and is handled as a mismatch.
ACTUAL_SHA=$(sha256sum "$WORKSPACE/evaluate.py" 2>/dev/null | awk '{print $1}')
if [ "$ACTUAL_SHA" = "$EVALUATE_PY_SHA256" ]; then
    echo "evaluate.py integrity OK ($EVALUATE_PY_SHA256)"
else
    printf '%s\n' \
        "ERROR: evaluate.py has been modified by the agent" \
        " expected: $EVALUATE_PY_SHA256" \
        " actual: $ACTUAL_SHA" \
        "Possible reward hacking — setting score to 0."
    # Record a zero score in both result files.
    printf '%s\n' '{"error": "evaluate.py tampered with by agent", "accuracy": 0}' > "$LOGS_DIR/metrics.json"
    printf '%s\n' "0" > "$LOGS_DIR/reward.txt"
    # Stop here with status 0; nothing after this point in the script runs.
    exit 0
fi

echo "=== PostTrainBench Verifier ==="
echo "Workspace: $WORKSPACE"
echo "Logs dir: $LOGS_DIR"
Expand Down Expand Up @@ -274,3 +294,33 @@ echo ""
echo "=== Verification complete ==="
echo "Results in $LOGS_DIR/"
ls -la "$LOGS_DIR/"

# ============================================================
# Artifact collection: copy workspace to /logs/artifacts/
# Excludes large model weight files (*.safetensors, *.bin, *.pt,
# *.pth, *.ckpt) and cache directories (__pycache__, .cache)
# so that Harbor can auto-collect scripts, configs, and logs
# without downloading multi-GB model shards.
# The full model is still available via Harbor's config-driven
# artifact collection (see job.yaml: artifacts: /home/agent/workspace).
# Copying is best-effort: a file or directory that cannot be
# copied is skipped rather than failing the verifier.
# ============================================================
echo ""
echo "=== Collecting artifacts ==="
ARTIFACTS_DIR="/logs/artifacts/workspace"
mkdir -p "$ARTIFACTS_DIR"

find "$WORKSPACE" -type f \
    ! -name "*.safetensors" \
    ! -name "*.bin" \
    ! -name "*.pt" \
    ! -name "*.pth" \
    ! -name "*.ckpt" \
    ! -path "*/__pycache__/*" \
    ! -path "*/.cache/*" \
    | while IFS= read -r src; do
        # Quote the prefix so it is removed literally, never as a glob pattern.
        rel="${src#"$WORKSPACE"/}"
        dst_dir="$ARTIFACTS_DIR/$(dirname "$rel")"
        # Skip this file if its destination directory cannot be created;
        # under `set -e` an unguarded failure here would abort the script.
        mkdir -p "$dst_dir" 2>/dev/null || continue
        cp "$src" "$dst_dir/" 2>/dev/null || true
    done

echo "Artifacts collected (model weights excluded) — see /logs/artifacts/workspace/"