def _collect_full_results_rows(
    config_paths: list[str],
) -> tuple[list[dict[str, Any]], list[tuple[str, str]]]:
    """Load the complete ``results.json`` payload for each run config.

    Unlike ``_collect_results_rows``, no fields are dropped: the whole results
    object is kept so it can later be converted to the EEE format.

    For every config path the function reads the config, takes its ``run_id``
    and loads ``<config dir>/<run_id>/results.json``. The payload must be a
    JSON object containing ``benchmark_score``.

    Args:
        config_paths: Paths of run config files.

    Returns:
        ``(rows, failures)`` where ``rows`` holds one results dict per config
        that loaded successfully and ``failures`` holds
        ``(config_path, error message)`` pairs for the ones that did not.
    """
    failures: list[tuple[str, str]] = []
    rows: list[dict[str, Any]] = []

    for config_path in config_paths:
        try:
            raw_config = _load_config_file(config_path)
            if not isinstance(raw_config, dict):
                raise click.ClickException(f"Config is not an object: {config_path}")
            run_id = raw_config.get("run_id")
            if not run_id:
                raise click.ClickException(f"Missing run_id in config: {config_path}")
            output_dir = Path(config_path).parent
            results_path = (output_dir / run_id / "results.json").resolve()
            if not results_path.is_file():
                raise click.ClickException(f"Results not found: {results_path}")
            payload = _load_config_file(str(results_path))
            if not isinstance(payload, dict):
                raise click.ClickException(f"Results JSON is not an object: {results_path}")
            if "benchmark_score" not in payload:
                raise click.ClickException(f"Missing benchmark_score in results: {results_path}")
            # Carry agent/benchmark over from the config for EEE metadata, but
            # only when the config actually has a value and the results do not:
            # this avoids injecting None-valued keys and also fills explicit nulls.
            for key in ("agent", "benchmark"):
                config_value = raw_config.get(key)
                if config_value is not None and payload.get(key) is None:
                    payload[key] = config_value
            rows.append(payload)
        except Exception as exc:
            # Deliberate best-effort: a broken config is reported to the caller
            # through ``failures`` instead of aborting the whole batch.
            failures.append((config_path, str(exc)))

    return rows, failures


# Mapping from a lowercase model-name prefix to the developer of that model family.
_MODEL_DEVELOPER_MAP: dict[str, str] = {
    "claude": "Anthropic",
    "gpt": "OpenAI",
    "gemini": "Google",
}


def _parse_model_info(model_name: str) -> tuple[str, str]:
    """Split an exgentic model name into ``(developer, model_slug)``.

    The model slug is the last non-empty ``/``-separated segment of
    ``model_name`` (e.g. ``'openai/aws/claude-opus-4-5'`` ->
    ``'claude-opus-4-5'``), with its original casing preserved. The developer
    is looked up by case-insensitive prefix in ``_MODEL_DEVELOPER_MAP`` and is
    ``"unknown"`` when no prefix matches.

    Args:
        model_name: Model name, optionally prefixed with provider segments.

    Returns:
        ``(developer, model_slug)``.
    """
    # Use the last *non-empty* segment so a trailing slash or stray whitespace
    # does not produce an empty slug.
    segments = [segment.strip() for segment in model_name.split("/")]
    raw_model = next((segment for segment in reversed(segments) if segment), model_name.strip())

    lowered = raw_model.lower()
    developer = next(
        (dev for prefix, dev in _MODEL_DEVELOPER_MAP.items() if lowered.startswith(prefix)),
        "unknown",
    )
    return developer, raw_model


def _convert_to_eee(
    rows: list[dict[str, Any]],
    retrieved_timestamp: str,
) -> list[dict[str, Any]]:
    """Convert exgentic results rows to Every Eval Ever EvaluationLog dicts.

    Each row yields one log containing a single evaluation result. Missing
    model, benchmark or agent names fall back to ``"unknown"``; non-string
    values for them are coerced with ``str`` so one odd row cannot abort the
    conversion. The score is ``benchmark_score``, falling back to
    ``average_score`` and then ``0.0``, rounded to 4 decimals.

    Args:
        rows: Results dicts, e.g. as returned by ``_collect_full_results_rows``.
        retrieved_timestamp: Timestamp string stored in every log and used as
            the last component of ``evaluation_id``.

    Returns:
        One EEE log dict per input row, in input order.
    """
    import re

    non_slug_chars = re.compile(r"[^a-z0-9]+")
    # Optional numeric row fields copied into score_details["details"],
    # with the number of decimals each one is rounded to.
    detail_precision = (
        ("average_agent_cost", 2),
        ("total_run_cost", 2),
        ("average_steps", 2),
        ("percent_finished", 4),
    )

    eee_logs: list[dict[str, Any]] = []

    for row in rows:
        model_name_raw = str(row.get("model_name") or "unknown")
        developer, model_slug = _parse_model_info(model_name_raw)
        model_id = f"{developer.lower()}/{model_slug}"

        benchmark = str(row.get("benchmark_name") or row.get("benchmark") or "unknown")
        agent_name = str(row.get("agent_name") or row.get("agent") or "unknown")
        agent_slug = non_slug_chars.sub("-", agent_name.lower()).strip("-")
        agent_details = {
            "agent_name": agent_name,
            "agent_framework": row.get("agent") or agent_slug,
        }
        subset = row.get("subset_name")

        eval_name = benchmark.lower().replace(" ", "-")
        if subset:
            eval_name = f"{eval_name}/{subset}"

        score = row.get("benchmark_score")
        if score is None:
            score = row.get("average_score", 0.0)

        score_details: dict[str, Any] = {
            "score": round(score, 4) if score is not None else 0.0,
        }

        # Sample-count uncertainty: only for a real positive number, so a
        # string or boolean session count cannot raise or leak into the output.
        total = row.get("total_sessions")
        if isinstance(total, (int, float)) and not isinstance(total, bool) and total > 0:
            score_details["uncertainty"] = {"num_samples": total}

        details: dict[str, str] = {
            key: str(round(row[key], digits)) for key, digits in detail_precision if row.get(key) is not None
        }
        if details:
            score_details["details"] = details

        eval_result: dict[str, Any] = {
            "evaluation_name": eval_name,
            "source_data": {
                "dataset_name": eval_name,
                "source_type": "url",
                "url": ["https://github.com/Exgentic/exgentic"],
            },
            "metric_config": {
                "evaluation_description": f"{benchmark} benchmark evaluation"
                + (f" ({subset} subset)" if subset else ""),
                "lower_is_better": False,
                "score_type": "continuous",
                "min_score": 0.0,
                "max_score": 1.0,
            },
            "score_details": score_details,
            "generation_config": {
                "generation_args": {
                    "agentic_eval_config": {
                        # Separate copy: the same details also go in model_info.
                        "additional_details": dict(agent_details),
                    },
                },
            },
        }

        sanitized_model_id = model_id.replace("/", "_")
        evaluation_id = f"{eval_name}/{agent_slug}__{sanitized_model_id}/{retrieved_timestamp}"

        eee_logs.append(
            {
                "schema_version": "0.2.2",
                "evaluation_id": evaluation_id,
                "retrieved_timestamp": retrieved_timestamp,
                "source_metadata": {
                    "source_name": "Exgentic Open Agent Leaderboard",
                    "source_type": "evaluation_run",
                    "source_organization_name": "Exgentic",
                    "source_organization_url": "https://github.com/Exgentic",
                    "evaluator_relationship": "third_party",
                },
                "eval_library": {
                    "name": "exgentic",
                    # NOTE(review): hard-coded version string; confirm it tracks the package.
                    "version": "0.1.0",
                },
                "model_info": {
                    "name": model_slug,
                    "id": model_id,
                    "developer": developer,
                    "additional_details": dict(agent_details),
                },
                "evaluation_results": [eval_result],
            }
        )

    return eee_logs
def _save_eee_files(eee_logs: list[dict[str, Any]], output_dir: str) -> list[str]:
    """Write each EEE log to ``<output_dir>/<developer>/<model name>/<uuid4>.json``.

    ``developer`` and ``name`` are read from the log's ``model_info``. Both are
    made safe for use as a single directory name: characters in
    ``<>:"/\\|?*`` become ``_``, and values that are missing, ``None``, empty
    or made only of dots (``"."``, ``".."``) become ``"unknown"`` so a log can
    never be written into ``output_dir`` itself or outside of it.

    Args:
        eee_logs: EEE log dicts to serialize as JSON.
        output_dir: Base directory; created together with any missing parents.

    Returns:
        Paths of the files written, in input order.
    """
    import re
    import uuid

    unsafe_chars = re.compile(r'[<>:"/\\|?*]')

    def _safe_component(value: Any) -> str:
        """Return *value* as a single safe directory name."""
        if value is None:
            return "unknown"
        text = unsafe_chars.sub("_", str(value)).strip()
        # "", "." and ".." would resolve to the parent directory tree, not a child.
        return text if text.strip(".") else "unknown"

    saved: list[str] = []
    base = Path(output_dir)

    for log in eee_logs:
        model_info = log.get("model_info") or {}
        developer = _safe_component(model_info.get("developer"))
        model_name = _safe_component(model_info.get("name"))

        dir_path = base / developer / model_name
        dir_path.mkdir(parents=True, exist_ok=True)

        # A random file name keeps repeated publishes from overwriting each other.
        filepath = dir_path / f"{uuid.uuid4()}.json"
        with open(filepath, "w", encoding="utf-8") as f:
            json.dump(log, f, indent=2, ensure_ascii=False)
        saved.append(str(filepath))

    return saved
Install it with: pip install datasets" - ) from err + """Publish run results to a HuggingFace dataset or in Every Eval Ever format.""" + import time config_paths = _expand_config_inputs(config_values, list(ctx.args)) - rows, failures = _collect_results_rows(config_paths) - if not rows: - raise click.ClickException("No valid results to publish.") + if fmt == "eee": + full_rows, failures = _collect_full_results_rows(config_paths) + if not full_rows: + raise click.ClickException("No valid results to publish.") + + retrieved_timestamp = str(time.time()) + eee_logs = _convert_to_eee(full_rows, retrieved_timestamp) + + # Save locally if output_dir is specified + if output_dir: + saved = _save_eee_files(eee_logs, output_dir) + click.echo(f"Saved {len(saved)} EEE JSON file(s) to {output_dir}/") + + # Push to HF if repo is specified + if repo_id: + try: + from datasets import Dataset, load_dataset + except ImportError as err: + raise click.ClickException( + "The 'datasets' package is required for publishing. 
Install it with: pip install datasets" + ) from err + + if append: + try: + existing_ds = load_dataset(repo_id, split="train") + existing_rows = list(existing_ds) + click.echo(f"Loaded {len(existing_rows)} existing row(s) from {repo_id}.") + all_rows = existing_rows + eee_logs + except Exception: + click.echo("No existing dataset found, creating new one.") + all_rows = eee_logs + else: + all_rows = eee_logs + + ds = Dataset.from_list(all_rows) + ds.push_to_hub(repo_id, private=private) + click.echo(f"Published {len(all_rows)} EEE row(s) to https://huggingface.co/datasets/{repo_id}") + + if not output_dir and not repo_id: + raise click.ClickException("Specify --output-dir and/or --repo for EEE format.") + + else: + # Original exgentic format + if not repo_id: + raise click.ClickException("--repo is required for exgentic format.") - if append: try: - existing_ds = load_dataset(repo_id, split="train") - existing_rows = list(existing_ds) - click.echo(f"Loaded {len(existing_rows)} existing row(s) from {repo_id}.") - - # Deduplicate by (benchmark, agent, model) triple - existing_keys = set() - for r in existing_rows: - key = (r.get("benchmark"), r.get("agent"), r.get("model")) - existing_keys.add(key) - - new_rows = [] - updated = 0 - for row in rows: - key = (row.get("benchmark"), row.get("agent"), row.get("model")) - if key in existing_keys: - # Replace existing row with updated one - existing_rows = [ - r for r in existing_rows if (r.get("benchmark"), r.get("agent"), r.get("model")) != key - ] - updated += 1 - new_rows.append(row) - - all_rows = existing_rows + new_rows - click.echo(f"Publishing {len(all_rows)} row(s) " f"({len(new_rows) - updated} new, {updated} updated).") - except Exception: - click.echo("No existing dataset found, creating new one.") + from datasets import Dataset, load_dataset + except ImportError as err: + raise click.ClickException( + "The 'datasets' package is required for publishing. 
Install it with: pip install datasets" + ) from err + + rows, failures = _collect_results_rows(config_paths) + + if not rows: + raise click.ClickException("No valid results to publish.") + + if append: + try: + existing_ds = load_dataset(repo_id, split="train") + existing_rows = list(existing_ds) + click.echo(f"Loaded {len(existing_rows)} existing row(s) from {repo_id}.") + + existing_keys = set() + for r in existing_rows: + key = (r.get("benchmark"), r.get("agent"), r.get("model")) + existing_keys.add(key) + + new_rows = [] + updated = 0 + for row in rows: + key = (row.get("benchmark"), row.get("agent"), row.get("model")) + if key in existing_keys: + existing_rows = [ + r for r in existing_rows if (r.get("benchmark"), r.get("agent"), r.get("model")) != key + ] + updated += 1 + new_rows.append(row) + + all_rows = existing_rows + new_rows + click.echo(f"Publishing {len(all_rows)} row(s) ({len(new_rows) - updated} new, {updated} updated).") + except Exception: + click.echo("No existing dataset found, creating new one.") + all_rows = rows + else: all_rows = rows - else: - all_rows = rows - ds = Dataset.from_list(all_rows) - ds.push_to_hub(repo_id, private=private) - click.echo(f"Published {len(all_rows)} row(s) to https://huggingface.co/datasets/{repo_id}") + ds = Dataset.from_list(all_rows) + ds.push_to_hub(repo_id, private=private) + click.echo(f"Published {len(all_rows)} row(s) to https://huggingface.co/datasets/{repo_id}") if failures: click.echo(f"Warning: {len(failures)} config(s) had errors and were skipped:")