diff --git a/templates/agents/resource_finder.txt b/templates/agents/resource_finder.txt
index 75cb082..2dafbbb 100644
--- a/templates/agents/resource_finder.txt
+++ b/templates/agents/resource_finder.txt
@@ -385,7 +385,31 @@ Check these sources in order:
 TARGET: Find suitable datasets for the research task
 PREFER: Established benchmarks over custom data
 
-STEP 3: Download and Validate Datasets
+STEP 3: Verify Dataset Accessibility (Pre-flight Check)
+─────────────────────────────────────────────────────────────────────────────
+
+Before downloading, verify that each candidate dataset actually exists and is
+reachable. This avoids wasting time on broken links or moved datasets.
+
+For each candidate from STEP 2:
+
+    python .claude/skills/dataset-verifier/scripts/verify_dataset.py <dataset-id-or-url>
+
+Examples:
+    python .claude/skills/dataset-verifier/scripts/verify_dataset.py glue
+    python .claude/skills/dataset-verifier/scripts/verify_dataset.py https://www.kaggle.com/datasets/uciml/iris
+
+Output is JSON with {id, exists, url, error}. Exit code 0 = exists, 1 = not.
+
+DECISION RULE:
+- exists=true → proceed to STEP 4 (download)
+- exists=false → drop the candidate, log the error in resources.md, and pick
+  the next best candidate from the STEP 2 results
+
+Run the verifier on ALL candidates before any download, e.g. with a loop like
+the sketch below. A failed verify is cheap (a single HTTP request); a failed
+download wastes minutes.
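+
+A minimal sketch of that loop (illustrative only; candidates.txt is a
+hypothetical file holding one dataset id or URL per line):
+
+```python
+import json
+import subprocess
+
+VERIFIER = ".claude/skills/dataset-verifier/scripts/verify_dataset.py"
+
+with open("candidates.txt") as f:
+    candidates = [line.strip() for line in f if line.strip()]
+
+verified, dropped = [], []
+for cand in candidates:
+    # Exit code 0 means the dataset exists; stdout carries the JSON record.
+    proc = subprocess.run(["python", VERIFIER, cand],
+                          capture_output=True, text=True)
+    info = json.loads(proc.stdout)
+    if proc.returncode == 0:
+        verified.append(info)
+    else:
+        dropped.append(info)   # log info["error"] in resources.md
+```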
+
+STEP 4: Download and Validate Datasets
 ─────────────────────────────────────────────────────────────────────────────
 
 IMPORTANT: Git-Friendly Dataset Handling
@@ -515,7 +539,7 @@ For each dataset:
             json.dump(samples, f, indent=2)
 ```
 
-STEP 4: Exploratory Data Analysis (Quick Check)
+STEP 5: Exploratory Data Analysis (Quick Check)
 ─────────────────────────────────────────────────────────────────────────────
 
 For each dataset:
diff --git a/templates/skills/dataset-verifier/SKILL.md b/templates/skills/dataset-verifier/SKILL.md
new file mode 100644
index 0000000..7ef35bf
--- /dev/null
+++ b/templates/skills/dataset-verifier/SKILL.md
@@ -0,0 +1,77 @@
+---
+name: dataset-verifier
+description: Verify a dataset id or URL exists and is reachable before download. Use during PHASE 2 of resource finding to filter inaccessible candidates.
+---
+
+# Dataset Verifier
+
+Pre-flight check that a dataset actually exists and is reachable before
+the experiment runner commits to download.
+
+## When to Use
+
+- During PHASE 2 of resource_finder, after candidate datasets are identified
+  but before download
+- When the agent considers a dataset citation but is unsure if it's accessible
+
+## How to Call
+
+```bash
+python .claude/skills/dataset-verifier/scripts/verify_dataset.py <identifier>
+```
+
+Examples:
+```bash
+# HuggingFace dataset (bare name or hf:// prefix)
+python verify_dataset.py glue
+python verify_dataset.py hf://datasets/squad
+
+# Kaggle dataset
+python verify_dataset.py kaggle://uciml/iris
+python verify_dataset.py https://www.kaggle.com/datasets/uciml/iris
+
+# Generic URL
+python verify_dataset.py https://example.com/dataset.tar.gz
+```
+
+## Output
+
+JSON to stdout:
+```json
+{
+  "id": "glue",
+  "exists": true,
+  "url": "https://huggingface.co/datasets/glue",
+  "error": null
+}
+```
+
+Exit code: 0 if exists, 1 otherwise.
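+
+A caller can branch on either the exit code or the parsed JSON; the two
+always agree. A minimal sketch (glue as an arbitrary example id):
+
+```python
+import json
+import subprocess
+
+proc = subprocess.run(
+    ["python", ".claude/skills/dataset-verifier/scripts/verify_dataset.py", "glue"],
+    capture_output=True, text=True,
+)
+info = json.loads(proc.stdout)
+assert (proc.returncode == 0) == info["exists"]  # exit code mirrors the JSON
+print(info["url"] if info["exists"] else info["error"])
+```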
+
+## Source Detection
+
+- `hf://...` or bare alphanumeric name → HuggingFace Hub
+- `kaggle://...` or `kaggle.com/datasets/...` → Kaggle
+- Otherwise → generic HTTP
+
+For existence checks, the script uses a single streaming HTTP GET (only
+headers are fetched; the body is discarded). This avoids servers that
+mishandle HEAD (e.g. Kaggle returns 404 for HEAD requests on valid dataset
+pages).
+
+**HuggingFace specifically**: the existence check hits the API endpoint
+`/api/datasets/<name>` rather than the web URL `/datasets/<name>`. Several
+legacy canonical names (e.g. `wikitext`, `cnn_dailymail`, `xsum`) return
+404 on the web URL but exist via the API and load via `load_dataset()`.
+The `url` field in the JSON output still reports the web URL for human use.
+
+## Limitations
+
+- Only checks HTTP reachability; it does not verify license, size, gating,
+  or file format. Those are downstream concerns.
+- No retry on transient errors. If the agent needs robustness, retry at
+  the call site.
+- HuggingFace URLs should point to the dataset root (e.g.
+  `https://huggingface.co/datasets/glue`), not subpaths like `/viewer`,
+  `/tree/main`, or `/blob/main/...`. Subpaths are interpreted as part of
+  the dataset slug and produce false negatives. Use the bare slug or the
+  root URL when in doubt.
diff --git a/templates/skills/dataset-verifier/scripts/verify_dataset.py b/templates/skills/dataset-verifier/scripts/verify_dataset.py
new file mode 100644
index 0000000..69ef85a
--- /dev/null
+++ b/templates/skills/dataset-verifier/scripts/verify_dataset.py
@@ -0,0 +1,85 @@
+#!/usr/bin/env python3
+"""
+Dataset existence verifier.
+
+Pre-flight check that a dataset id or URL is reachable before download.
+Supports HuggingFace Hub, Kaggle, and generic URLs.
+
+Usage:
+    python verify_dataset.py <identifier>
+
+Output: JSON to stdout with {id, exists, url, error}.
+Exit code: 0 if exists, 1 otherwise.
+"""
+
+import sys
+import json
+import re
+import argparse
+
+
+def _resolve_url(identifier: str) -> tuple[str, str, str]:
+    """Normalize to (id, web_url, check_url).
+
+    web_url is what the user visits; check_url is what we GET for existence.
+    For HuggingFace, check_url uses the API endpoint because some legacy
+    canonical dataset names (e.g. 'wikitext') 404 on the web URL but exist
+    via the API and through load_dataset().
+    """
+    s = identifier.strip()
+
+    m = re.match(r"^(?:hf|huggingface)://(?:datasets/)?(.+)$", s)
+    if m:
+        ds = m.group(1).strip("/")
+        return ds, f"https://huggingface.co/datasets/{ds}", f"https://huggingface.co/api/datasets/{ds}"
+
+    m = re.match(r"^kaggle://(.+)$", s)
+    if m:
+        ds = m.group(1).strip("/")
+        web = f"https://www.kaggle.com/datasets/{ds}"
+        return ds, web, web
+
+    if s.startswith(("http://", "https://")):
+        m = re.search(r"huggingface\.co/datasets/([^?#]+)", s)
+        if m:
+            slug = m.group(1).rstrip("/")
+            return s, s, f"https://huggingface.co/api/datasets/{slug}"
+        return s, s, s
+
+    if re.match(r"^[a-zA-Z0-9_\-/]+$", s):
+        return s, f"https://huggingface.co/datasets/{s}", f"https://huggingface.co/api/datasets/{s}"
+
+    return s, s, s
+
+
+def verify(identifier: str) -> dict:
+    ds_id, web_url, check_url = _resolve_url(identifier)
+    try:
+        import httpx
+    except ImportError:
+        return {"id": ds_id, "exists": False, "url": web_url, "error": "httpx not installed"}
+
+    try:
+        with httpx.Client(timeout=30.0, follow_redirects=True) as client:
+            # Streaming GET: status and headers arrive first; leaving the
+            # context closes the connection without reading the body.
+            with client.stream("GET", check_url) as response:
+                status = response.status_code
+    except httpx.RequestError as e:
+        return {"id": ds_id, "exists": False, "url": web_url, "error": f"network error: {type(e).__name__}"}
+
+    if 200 <= status < 300:
+        return {"id": ds_id, "exists": True, "url": web_url, "error": None}
+    return {"id": ds_id, "exists": False, "url": web_url, "error": f"HTTP {status}"}
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Verify dataset existence")
+    parser.add_argument("identifier", help="Dataset id, hf:// URL, kaggle:// URL, or generic HTTPS URL")
+    args = parser.parse_args()
+
+    result = verify(args.identifier)
+    print(json.dumps(result, indent=2))
+    sys.exit(0 if result["exists"] else 1)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/templates/skills/paper-finder/scripts/find_papers.py b/templates/skills/paper-finder/scripts/find_papers.py
index ced8493..d2ed730 100644
--- a/templates/skills/paper-finder/scripts/find_papers.py
+++ b/templates/skills/paper-finder/scripts/find_papers.py
@@ -14,9 +14,87 @@
 import json
 import os
 import re
+import math
+import time
+import random
 import argparse
 from datetime import datetime
 
+# Transient errors: rate-limit or server hiccups
+RETRYABLE_STATUS = {429, 500, 502, 503, 504}
+MAX_ATTEMPTS = 5
+BASE_DELAY = 1.0
+MAX_DELAY = 60.0
+
+
+def score_paper(paper, current_year=None):
+    """Score = relevance + citation bonus (log-scaled, capped at 2.0)
+    + recency bonus (0.5, decaying with a 5-year time constant; skipped
+    when the year is missing)."""
+    if current_year is None:
+        current_year = datetime.now().year
+    relevance = paper.get("relevance", 0)
+    citations = paper.get("influential_citations", 0)
+    year = paper.get("year")
+    score = float(relevance)
+    score += min(math.log1p(citations), 2.0)
+    if year:
+        age = current_year - year
+        score += 0.5 * math.exp(-age / 5.0)
+    return score
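+
+# Worked example (hypothetical values): relevance=3, influential_citations=10,
+# published two years ago:
+#   3 + min(log1p(10) ≈ 2.40, 2.0) + 0.5 * exp(-2/5) ≈ 3 + 2.0 + 0.34 ≈ 5.34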
+
+
+def _dedup_key(paper):
+    if paper.get("doi"):
+        return paper["doi"]
+    if paper.get("url"):
+        return paper["url"]
+    return (paper.get("title", "").lower(), paper.get("year"))
+
+
+def _dedup_papers(papers):
+    seen = set()
+    deduped = []
+    for paper in papers:
+        key = _dedup_key(paper)
+        if key not in seen:
+            seen.add(key)
+            deduped.append(paper)
+    return deduped
+
+
+def _backoff_delay(attempt):
+    delay = min(BASE_DELAY * (2 ** (attempt - 1)), MAX_DELAY)
+    return delay * random.uniform(0.5, 1.5)
+
+
+def _post_with_retry(client, url, payload):
+    """POST to url with exponential backoff and ±50% jitter.
+
+    Retries on:
+    - Response status codes in RETRYABLE_STATUS (429, 500, 502, 503, 504)
+    - httpx.RequestError (connection errors, timeouts, etc.)
+
+    Returns immediately, without retrying, on any other status (success or
+    a non-retryable 4xx such as 404). After MAX_ATTEMPTS, returns the final
+    response for status-code errors and re-raises httpx.RequestError.
+    """
+    import httpx  # Lazy import: find_papers() has verified availability
+    for attempt in range(1, MAX_ATTEMPTS + 1):
+        try:
+            response = client.post(url, json=payload)
+            if response.status_code not in RETRYABLE_STATUS:
+                return response
+            if attempt == MAX_ATTEMPTS:
+                return response
+            reason = f"status {response.status_code}"
+        except httpx.RequestError as e:
+            if attempt == MAX_ATTEMPTS:
+                raise
+            reason = type(e).__name__
+        delay = _backoff_delay(attempt)
+        print(
+            f"Retry {attempt}/{MAX_ATTEMPTS - 1}: {reason}, retrying in {delay:.1f}s",
+            file=sys.stderr,
+        )
+        time.sleep(delay)
+
 
 def find_papers(query: str, mode: str = "fast", url: str = "http://localhost:8000/api/2/rounds"):
     """Call paper-finder API and return formatted results."""
@@ -27,7 +105,7 @@ def find_papers(query: str, mode: str = "fast", url: str = "http://localhost:800
 
     try:
         with httpx.Client(timeout=300.0) as client:
-            response = client.post(url, json={
+            response = _post_with_retry(client, url, {
                 "paper_description": query,
                 "operation_mode": mode,
                 "read_results_from_cache": True
@@ -64,9 +142,17 @@
                 "url": doc.get('url', ''),
                 "relevance": rel,
                 "abstract": (doc.get('abstract') or ''),
-                "citations": doc.get('citation_count', 0) or 0
+                "citations": doc.get('citation_count', 0) or 0,
+                "venue": doc.get('venue', ''),
+                "influential_citations": doc.get('influential_citation_count', 0) or 0,
             })
 
+    for paper in results["papers"]:
+        paper["score"] = score_paper(paper)
+    results["papers"].sort(key=lambda p: -p["score"])
+    results["papers"] = _dedup_papers(results["papers"])
+    results["total"] = len(results["papers"])
+
     return results
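
A quick smoke test for the scoring and dedup changes (a sketch only; assumes
find_papers.py is importable from the working directory):

```python
from find_papers import score_paper, _dedup_papers

papers = [
    {"title": "A", "year": 2023, "relevance": 3, "influential_citations": 10, "doi": "10.1/a"},
    {"title": "A", "year": 2023, "relevance": 3, "influential_citations": 10, "doi": "10.1/a"},
    {"title": "B", "year": 2010, "relevance": 3, "influential_citations": 0},
]
papers.sort(key=lambda p: -score_paper(p))  # same ordering the patch applies
papers = _dedup_papers(papers)              # doi key collapses the duplicate
assert [p["title"] for p in papers] == ["A", "B"]
```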