28 changes: 26 additions & 2 deletions templates/agents/resource_finder.txt
@@ -385,7 +385,31 @@ Check these sources in order:
TARGET: Find suitable datasets for the research task
PREFER: Established benchmarks over custom data

STEP 3: Download and Validate Datasets
STEP 3: Verify Dataset Accessibility (Pre-flight Check)
─────────────────────────────────────────────────────────────────────────────

Before downloading, verify each candidate dataset actually exists and is
reachable. This avoids wasting time on broken links or moved datasets.

For each candidate from STEP 2:

python .claude/skills/dataset-verifier/scripts/verify_dataset.py <id_or_url>

Examples:
python .claude/skills/dataset-verifier/scripts/verify_dataset.py glue
python .claude/skills/dataset-verifier/scripts/verify_dataset.py https://www.kaggle.com/datasets/uciml/iris

Output is JSON with {id, exists, url, error}. Exit code 0 = exists, 1 = not found or unreachable.

DECISION RULE:
- exists=true → proceed to STEP 4 (download)
- exists=false → drop the candidate, log the error in resources.md, pick the
next best candidate from STEP 2 results

Run the verifier on ALL candidates before any download. A failed verify is
cheap (a single streaming GET that reads only the response headers); a failed
download wastes minutes.
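
For example, a minimal verification loop (the candidate ids and the
resources.md log format are illustrative, not part of the template):

```python
import json
import subprocess

VERIFIER = ".claude/skills/dataset-verifier/scripts/verify_dataset.py"
candidates = ["glue", "hf://datasets/squad", "kaggle://uciml/iris"]  # from STEP 2

verified = []
for cand in candidates:
    proc = subprocess.run(["python", VERIFIER, cand],
                          capture_output=True, text=True)
    result = json.loads(proc.stdout)
    if proc.returncode == 0:
        verified.append(result)
    else:
        # Log the dropped candidate, then move on to the next one.
        with open("resources.md", "a") as f:
            f.write(f"- DROPPED {result['id']}: {result['error']}\n")
```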

STEP 4: Download and Validate Datasets
─────────────────────────────────────────────────────────────────────────────

IMPORTANT: Git-Friendly Dataset Handling
@@ -515,7 +539,7 @@ For each dataset:
json.dump(samples, f, indent=2)
```

STEP 4: Exploratory Data Analysis (Quick Check)
STEP 5: Exploratory Data Analysis (Quick Check)
─────────────────────────────────────────────────────────────────────────────

For each dataset:
77 changes: 77 additions & 0 deletions templates/skills/dataset-verifier/SKILL.md
@@ -0,0 +1,77 @@
---
name: dataset-verifier
description: Verify a dataset id or URL exists and is reachable before download. Use during STEP 3 of resource finding to filter inaccessible candidates before downloading.
---

# Dataset Verifier

Pre-flight check that a dataset actually exists and is reachable before
the experiment runner commits to download.

## When to Use

- During STEP 3 of resource_finder, after candidate datasets are identified
  (STEP 2) but before download (STEP 4)
- When the agent considers a dataset citation but is unsure if it's accessible

## How to Call

```bash
python .claude/skills/dataset-verifier/scripts/verify_dataset.py <id_or_url>
```

Examples:
```bash
# HuggingFace dataset (bare name or hf:// prefix)
python verify_dataset.py glue
python verify_dataset.py hf://datasets/squad

# Kaggle dataset
python verify_dataset.py kaggle://uciml/iris
python verify_dataset.py https://www.kaggle.com/datasets/uciml/iris

# Generic URL
python verify_dataset.py https://example.com/dataset.tar.gz
```

## Output

JSON to stdout:
```json
{
"id": "glue",
"exists": true,
"url": "https://huggingface.co/datasets/glue",
"error": null
}
```

Exit code: 0 if exists, 1 otherwise.
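
For programmatic use, a caller can rely on the exit code and parse stdout.
A minimal sketch (assumes the script is installed at the path below):

```python
import json
import subprocess

proc = subprocess.run(
    ["python", ".claude/skills/dataset-verifier/scripts/verify_dataset.py", "glue"],
    capture_output=True,
    text=True,
)
info = json.loads(proc.stdout)  # {"id": ..., "exists": ..., "url": ..., "error": ...}
if proc.returncode != 0:
    print(f"skipping {info['id']}: {info['error']}")
```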

## Source Detection

- `hf://...` or a bare name (letters, digits, `_`, `-`, `/`) → HuggingFace Hub
- `kaggle://...` or `kaggle.com/datasets/...` → Kaggle
- Otherwise → generic HTTP

For existence checks, the script uses a single streaming HTTP GET: only the
response headers are read and the body is never downloaded. This avoids
servers that mishandle HEAD (e.g. Kaggle returns 404 for HEAD on valid
dataset pages).

**HuggingFace specifically**: the existence check hits the API endpoint
`/api/datasets/<slug>` rather than the web URL `/datasets/<slug>`. Several
legacy canonical names (e.g. `wikitext`, `cnn_dailymail`, `xsum`) return
404 on the web URL but exist via the API and load via `load_dataset()`.
The `url` field in the JSON output still reports the web URL for human use.
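
Concretely, the check is roughly equivalent to this sketch (shown for the
`wikitext` legacy name):

```python
import httpx

# Streaming GET: the response headers arrive, but the body is never read,
# so the cost is close to a HEAD request without depending on the server
# handling HEAD correctly.
with httpx.Client(timeout=30.0, follow_redirects=True) as client:
    with client.stream("GET", "https://huggingface.co/api/datasets/wikitext") as r:
        exists = 200 <= r.status_code < 300
print(exists)
```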

## Limitations

- Only checks HTTP reachability — does not verify license, size, gating,
or file format. Those are downstream concerns.
- No retry on transient errors. If the agent needs robustness, retry at
the call site.
- HuggingFace URLs should point to the dataset root (e.g.
`https://huggingface.co/datasets/glue`), not subpaths like `/viewer`,
`/tree/main`, or `/blob/main/...`. Subpaths are interpreted as part of
the dataset slug and produce false negatives. Use the bare slug or the
root URL when in doubt.
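
If robustness matters, a thin wrapper at the call site suffices. A sketch
with a hypothetical `verify_with_retry` helper (attempt count and delays are
illustrative); it retries only on the verifier's `network error: ...`
strings, since an HTTP 404 is definitive:

```python
import json
import subprocess
import time

def verify_with_retry(identifier, attempts=3, base_delay=2.0):
    """Re-run the verifier on transient failures with exponential backoff."""
    for i in range(attempts):
        proc = subprocess.run(
            ["python", ".claude/skills/dataset-verifier/scripts/verify_dataset.py",
             identifier],
            capture_output=True, text=True,
        )
        info = json.loads(proc.stdout)
        # Only network errors are worth retrying; an HTTP 404 is definitive.
        if proc.returncode == 0 or not str(info.get("error")).startswith("network error"):
            return info
        if i < attempts - 1:
            time.sleep(base_delay * (2 ** i))
    return info
```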
85 changes: 85 additions & 0 deletions templates/skills/dataset-verifier/scripts/verify_dataset.py
@@ -0,0 +1,85 @@
#!/usr/bin/env python3
"""
Dataset existence verifier.

Pre-flight check that a dataset id or URL is reachable before download.
Supports HuggingFace Hub, Kaggle, and generic URLs.

Usage:
python verify_dataset.py <id_or_url>

Output: JSON to stdout with {id, exists, url, error}.
Exit code: 0 if exists, 1 otherwise.
"""

import sys
import json
import re
import argparse


def _resolve_url(identifier: str) -> tuple[str, str, str]:
"""Normalize to (id, web_url, check_url).

web_url is what the user visits; check_url is what we GET for existence.
For HuggingFace, check_url uses the API endpoint because some legacy
canonical dataset names (e.g. 'wikitext') 404 on the web URL but exist
via the API and through load_dataset().
"""
s = identifier.strip()

m = re.match(r"^(?:hf|huggingface)://(?:datasets/)?(.+)$", s)
if m:
ds = m.group(1).strip("/")
return ds, f"https://huggingface.co/datasets/{ds}", f"https://huggingface.co/api/datasets/{ds}"

m = re.match(r"^kaggle://(.+)$", s)
if m:
ds = m.group(1).strip("/")
web = f"https://www.kaggle.com/datasets/{ds}"
return ds, web, web

if s.startswith(("http://", "https://")):
m = re.search(r"huggingface\.co/datasets/([^?#]+)", s)
if m:
slug = m.group(1).rstrip("/")
return s, s, f"https://huggingface.co/api/datasets/{slug}"
return s, s, s

if re.match(r"^[a-zA-Z0-9_\-/]+$", s):
return s, f"https://huggingface.co/datasets/{s}", f"https://huggingface.co/api/datasets/{s}"

return s, s, s


def verify(identifier: str) -> dict:
ds_id, web_url, check_url = _resolve_url(identifier)
try:
import httpx
except ImportError:
return {"id": ds_id, "exists": False, "url": web_url, "error": "httpx not installed"}

try:
with httpx.Client(timeout=30.0, follow_redirects=True) as client:
with client.stream("GET", check_url) as response:
status = response.status_code
except httpx.RequestError as e:
return {"id": ds_id, "exists": False, "url": web_url, "error": f"network error: {type(e).__name__}"}

if 200 <= status < 300:
return {"id": ds_id, "exists": True, "url": web_url, "error": None}
return {"id": ds_id, "exists": False, "url": web_url, "error": f"HTTP {status}"}


def main():
parser = argparse.ArgumentParser(description="Verify dataset existence")
parser.add_argument("identifier", help="Dataset id, hf:// URL, kaggle:// URL, or generic HTTPS URL")
args = parser.parse_args()

result = verify(args.identifier)
print(json.dumps(result, indent=2))
sys.exit(0 if result["exists"] else 1)


if __name__ == "__main__":
main()
90 changes: 88 additions & 2 deletions templates/skills/paper-finder/scripts/find_papers.py
@@ -14,9 +14,87 @@
import json
import os
import re
import math
import time
import random
import argparse
from datetime import datetime

# Transient errors: rate-limit or server hiccups
RETRYABLE_STATUS = {429, 500, 502, 503, 504}
MAX_ATTEMPTS = 5
BASE_DELAY = 1.0
MAX_DELAY = 60.0

def score_paper(paper, current_year=None):
    """Composite score: relevance + capped citation bonus + recency bonus."""
    if current_year is None:
        current_year = datetime.now().year
    relevance = paper.get("relevance", 0)
    citations = paper.get("influential_citations", 0)
    year = paper.get("year")
    # Relevance is the base signal; influential citations add at most 2.0,
    # log-damped so heavily cited classics don't drown out relevant new work.
    score = float(relevance)
    score += min(math.log1p(citations), 2.0)
    # Recency bonus: 0.5 for a current-year paper, decaying with a 5-year
    # time constant.
    if year:
        age = current_year - year
        score += 0.5 * math.exp(-age / 5.0)
    return score
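
# Worked example (hypothetical numbers): a 2023 paper with relevance 0.9 and
# 12 influential citations, scored in 2025:
#   0.9 + min(log1p(12), 2.0) + 0.5 * exp(-2 / 5.0) ≈ 0.9 + 2.0 + 0.335 ≈ 3.24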


def _dedup_key(paper):
if paper.get("doi"):
return paper["doi"]
if paper.get("url"):
return paper["url"]
return (paper.get("title", "").lower(), paper.get("year"))


def _dedup_papers(papers):
seen = set()
deduped = []
for paper in papers:
key = _dedup_key(paper)
if key not in seen:
seen.add(key)
deduped.append(paper)
return deduped


def _backoff_delay(attempt):
delay = min(BASE_DELAY * (2 ** (attempt - 1)), MAX_DELAY)
return delay * random.uniform(0.5, 1.5)
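
# Pre-jitter schedule: 1s, 2s, 4s, 8s for attempts 1-4 (capped at 60s);
# jitter then scales each delay by a uniform factor in [0.5, 1.5].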


def _post_with_retry(client, url, payload):
"""POST to url with exponential backoff and ±50% jitter.

Retries on:
- Response status codes in RETRYABLE_STATUS (429, 500, 502, 503, 504)
- httpx.RequestError (connection errors, timeouts, etc.)

Returns immediately without retrying on other 4xx responses (e.g. 404).
After MAX_ATTEMPTS, returns the final response for status-code errors;
re-raises the exception for httpx.RequestError.
"""
import httpx # Lazy import: find_papers() has verified availability
for attempt in range(1, MAX_ATTEMPTS + 1):
try:
response = client.post(url, json=payload)
if response.status_code not in RETRYABLE_STATUS:
return response
if attempt == MAX_ATTEMPTS:
return response
reason = f"status {response.status_code}"
except httpx.RequestError as e:
if attempt == MAX_ATTEMPTS:
raise
reason = type(e).__name__
delay = _backoff_delay(attempt)
print(
f"Retry {attempt}/{MAX_ATTEMPTS - 1}: {reason}, retrying in {delay:.1f}s",
file=sys.stderr,
)
time.sleep(delay)


def find_papers(query: str, mode: str = "fast", url: str = "http://localhost:8000/api/2/rounds"):
"""Call paper-finder API and return formatted results."""
@@ -27,7 +105,7 @@ def find_papers(query: str, mode: str = "fast", url: str = "http://localhost:800

try:
with httpx.Client(timeout=300.0) as client:
response = client.post(url, json={
response = _post_with_retry(client, url, {
"paper_description": query,
"operation_mode": mode,
"read_results_from_cache": True
@@ -64,9 +142,17 @@ def find_papers(query: str, mode: str = "fast", url: str = "http://localhost:800
"url": doc.get('url', ''),
"relevance": rel,
"abstract": (doc.get('abstract') or ''),
"citations": doc.get('citation_count', 0) or 0
"citations": doc.get('citation_count', 0) or 0,
"venue": doc.get('venue', ''),
"influential_citations": doc.get('influential_citation_count', 0) or 0,
})

    for paper in results["papers"]:
        paper["score"] = score_paper(paper)
    # Sort before dedup so the highest-scored copy of any duplicate survives.
    results["papers"].sort(key=lambda p: -p["score"])
    results["papers"] = _dedup_papers(results["papers"])
    results["total"] = len(results["papers"])

return results

