diff --git a/.claude/skills/gepa-optimization/SKILL.md b/.claude/skills/gepa-optimization/SKILL.md new file mode 120000 index 0000000..3957530 --- /dev/null +++ b/.claude/skills/gepa-optimization/SKILL.md @@ -0,0 +1 @@ +../../../scripts/experiments/gepa-flowchart/SKILL.md \ No newline at end of file diff --git a/packages/viewer/src/client/viewer.ts b/packages/viewer/src/client/viewer.ts index 377c540..63dfe09 100644 --- a/packages/viewer/src/client/viewer.ts +++ b/packages/viewer/src/client/viewer.ts @@ -19,17 +19,6 @@ const wsid = isShareMode ? "" : (_seg[1] ?? ""); const apiBase = isShareMode ? `/s/${shareToken}` : `/w/${wsid}`; // Read-only: the built-in /w/demo/ showcase AND every share view. Hides clear/console/share controls. const READONLY = isShareMode || wsid === "demo"; -// Experimental features (agent console / board history) are hidden by default. Reveal with -// `?experimental=1` (sticky via localStorage `tc-experimental`); `?experimental=0` hides them again. -const EXPERIMENTAL = (() => { - try { - const q = new URLSearchParams(location.search).get("experimental"); - if (q !== null) localStorage.setItem("tc-experimental", q === "0" || q === "" ? "" : "1"); - return !!localStorage.getItem("tc-experimental"); - } catch { - return false; - } -})(); // Honor the OS "reduce motion" setting for auto-advancing (`live`) scopes: render one frame then // freeze, instead of looping. `frozenLive` tracks scopes already shown their first frame. 
const REDUCE_MOTION = typeof matchMedia === "function" && matchMedia("(prefers-reduced-motion: reduce)").matches; @@ -1164,17 +1153,11 @@ if (isShareMode) { document.getElementById("history-toggle")?.remove(); document.getElementById("templates")?.remove(); // keep the public demo from exposing the library } else { - const templatesBtn = document.getElementById("templates") as HTMLButtonElement | null; // not experimental + initConsole(); + const ht = document.getElementById("history-toggle"); + if (ht) ht.onclick = () => toggleHistory(!historyEnabled); + const templatesBtn = document.getElementById("templates") as HTMLButtonElement | null; if (templatesBtn) { templatesBtn.hidden = false; templatesBtn.onclick = () => void openTemplatesPopover(templatesBtn); } - if (EXPERIMENTAL) { - initConsole(); - const ht = document.getElementById("history-toggle"); - if (ht) ht.onclick = () => toggleHistory(!historyEnabled); - } else { - // Experimental features off by default — hide the chat console + board-history toggles entirely. - document.getElementById("console-toggle")?.remove(); - document.getElementById("history-toggle")?.remove(); - } } const shareBtn = document.getElementById("share") as HTMLButtonElement | null; if (shareBtn) shareBtn.onclick = () => void openSharePopover(shareBtn); diff --git a/scripts/experiments/gepa-flowchart/.gitignore b/scripts/experiments/gepa-flowchart/.gitignore index b875e21..977d7b3 100644 --- a/scripts/experiments/gepa-flowchart/.gitignore +++ b/scripts/experiments/gepa-flowchart/.gitignore @@ -2,4 +2,26 @@ __pycache__/ *.pyc .venv/ node_modules/ +.pytest_cache/ + +# GEPA run artifacts — keep source, summaries (*.md), reports, and best_* JSON. +# Everything below is regenerable scratch from a run and is NOT committed. 
runs/ +**/gepa_state.bin +**/run_log* +**/candidate_tree.html +**/candidates.json +**/generated_best_outputs_valset/ +**/frozen_questions.json +**/frozen_junior.json +overnight/frozen/ + +# scratch example boards +board_*.json +consolidation_board.json +panes_stack.json + +# stray logs + generated corpus +**/*.log +**/*.log.* +overnight/corpus.json diff --git a/scripts/experiments/gepa-flowchart/SKILL.md b/scripts/experiments/gepa-flowchart/SKILL.md new file mode 100644 index 0000000..1fa63ca --- /dev/null +++ b/scripts/experiments/gepa-flowchart/SKILL.md @@ -0,0 +1,174 @@ +--- +name: gepa-optimization +description: Use when running, extending, or reasoning about GEPA prompt optimization for termchart diagram skills — the runbook (commands, env vars, entry points), the scoring/metric design, the OOD-holdout discipline, and the hard-won gotchas. Covers single-prompt, per-journey, and joint topology-skill optimization. +--- + +# GEPA optimization for termchart + +GEPA (reflective prompt evolution, `gepa==0.1.1`) optimizes the **prompt-text +"skills"** termchart uses to author diagrams, scored by **rendering the board in a +real browser and judging it**. This skill is the methodology + runbook so a future +session reproduces a run without re-deriving any of it. + +Everything below runs from `scripts/experiments/gepa-flowchart/`. Package README +covers first-time setup (npm build of the viewer, `pip install -e ".[dev]"`); this +doc is the *how to run it well* layer. + +## What GEPA actually is here (mental model) + +- A GEPA **candidate is a `dict` of named prompt-text components** — topology skills + (`board_layout`, `graph_process_spine`, `graph_entity_lanes`, `graph_zoned_tiers`, + `comparison_grid`, `dashboard_grid`, `report_rows`, `screen_frame`, `chart_internal`), + a universal `artifact_note`, and per-journey tails (`tail_<journey>`). +- GEPA does **evolutionary + reflective search over those strings**. It is **NOT** + spinning up Claude Code instances or skills. 
Three LLM roles, each swappable: + **generator** (authors the board), **judge/reader** (scores it), **reflector** + (proposes prompt edits from feedback). +- A journey's prompt is **composed by walking its topology path**: `board_layout` + + the journey's pattern skill(s) + its tail. Focused composition (only path skills, + no global dump) is what prevents cross-artifact rule leakage. +- Hierarchy: **JOURNEY** (use-case → topology + rubric) → **TOPOLOGY skills** (shared, + GEPA-optimized) → **ATOMS** (covered by input schema, no per-atom skill). + +## Auth (Vertex is the default in practice) + +```bash +source ~/.profile 2>/dev/null # remote viewer/env vars live here, NOT in the Bash tool's env +export CLAUDE_CODE_USE_VERTEX=1 ANTHROPIC_VERTEX_PROJECT_ID=adk-coding-agents \ + CLOUD_ML_REGION=global PYTHONDONTWRITEBYTECODE=1 +gcloud auth application-default login # ADC (once) +``` +`get_client()` auto-selects the Vertex client when `CLAUDE_CODE_USE_VERTEX` (or +`GEPA_USE_VERTEX`) is set; otherwise it uses the direct Anthropic API +(`ANTHROPIC_API_KEY`). Gemini judges use `GOOGLE_CLOUD_PROJECT` / `GOOGLE_CLOUD_LOCATION`. 
+ +## Entry points (`python -m gepa_flowchart.<module>`) + +| Module | Purpose | Key flags | +|---|---|---| +| `run` | Single-prompt optimization (brainstorm+generate over topics) | `--smoke` `--max-metric-calls` `--train 8` `--val` `--topics` `--run-dir` | +| `journey_run` | Optimize ONE journey's components | `--journey <id>` (req) `--max-metric-calls 90` `--val 2` `--run-dir` | +| `multi_journey_run` | **Joint** GEPA over shared topology skills across journeys | `--group {graph,comparison,dashboard,report,chart,screen}` or `--journeys …`; `--max-metric-calls 200` `--val 1` `--run-dir` (req) `--seed-from <path>` (warm start) | +| `type_run` | Typed-pipeline run (per diagram type) | see `--help` | +| `holdout_eval` | **OOD gate**: seed vs promoted `SKILL_TEXT` on the untouched holdout, cross-family panel + K=3 | (no args; reads `holdout_journeys.json`) | +| `ood_overexplain <run-dir>` | OOD eval of a run's `best_topology_skills.json` on **junior + viz** axes | positional run dir (default `overnight/combined_overexplain`) | +| `topology_regression` | 3-arm FLAT / SCHEMA / TOPO regression on a fixed sample | — | +| `crosseval` / `cross_round_validate` | Cross-eval best skills on held-out instances / across rounds | — | +| `judge_agreement` | Inter-judge agreement diagnostics | — | +| `promote_validated` | Promote validated skills into `skill_library.SKILL_TEXT` | — | +| `recover` | Rebuild best candidate from `gepa_state.bin` | — | + +Outputs land in the `--run-dir`: `report.md`, `best_prompts.json` / +`best_topology_skills.json`, `frozen_questions.json`, `gepa_state.bin`, `run_log*.txt`. 
+ +## The metric (unified scorer — `unified_metric.py`) + +Board is rendered in a real browser (viewer + Chromium via `TypedRenderService`) and +scored on axes, gated by structural validity, combined by **weighted harmonic mean**: + +| Axis | What | Default weight | env | +|---|---|---|---| +| comprehension | mean(text VQA, vision VQA) from run-frozen reader questions | `w_comp 0.35` | `GEPA_W_COMP` | +| geometry | mean(heuristic `geometryReport`, rendered-DOM overlaps/offscreen/font) — **dominant** | `w_geom 0.50` | `GEPA_W_GEOM` | +| visual_quality | legibility / crowding / overlaps / clipping (vision) | `w_vq 0.15` | `GEPA_W_VQ` | +| junior | per-journey **junior-comprehension** rubric (over-explain for newcomers) | `GEPA_W_JUNIOR` | gated by `GEPA_JUNIOR_RUBRIC=1` | +| viz | visualization-usage: good prose/diagram mix, products have images, links checked | `GEPA_W_VIZ` | gated by `GEPA_VIZ_USAGE=1` | + +- **Which axes span the score + Pareto frontier**: `GEPA_OBJECTIVES` (default all three + core axes). `GEPA_COMP_FLOOR` guards comprehension when it's excluded from the score. +- **`frontier_type` = "hybrid"** (`GEPA_FRONTIER_TYPE`): Pareto front over both val + instances and objectives. +- Default models: generator/judge/reflection/vision `claude-opus-4-8`, reader + `claude-sonnet-4-6`. Override with `GEPA_GEN_MODEL` / `GEPA_JUDGE_MODEL` / + `GEPA_REFLECTION_MODEL` / `GEPA_VISION_MODEL` / `GEPA_READER_MODEL`. + +### Anti-Goodhart machinery (use it — don't defeat it) + +- **PoLL panel** (Verga 2024): `GEPA_JUDGE_PANEL="m1,m2,…"` → per-axis **median** across + judges + **inter-judge disagreement** signal + **per-axis abstain** on empty responses. + Empty panel falls back to `[vision_model]`. +- **K-sample generation**: `GEPA_GEN_SAMPLES=K` → median over K rolls (`score_sampled`) + to average out *generation* variance, the dominant noise source. +- **Optimize-judge ≠ validation-judge.** Optimize with a cheap judge (e.g. 
Gemini + Flash); **validate/gate with a different, cross-family panel** (`GATE_PANEL` in + `holdout_eval`/`ood_overexplain`, K≥3). If both are the same model, you're optimizing + the judge's blind spots. + +## The OOD-holdout discipline (the gate — non-negotiable) + +- `holdout_journeys.json` is **10 out-of-distribution use-cases** kept in a **separate + file so optimization physically cannot load it**. **Never** add a holdout journey to + a trainset or valset. It is the only honest generalization signal. +- Validate with `holdout_eval` (promoted-skills gate) or `ood_overexplain <run-dir>` + (per-run junior/viz). Both use the cross-family gate panel + K≥3. +- **Noise-floor verdict**: compare **changed-skill** journeys vs **control** journeys + (whose composed prompt didn't change) in the same run. A lift only counts if it clears + the control spread. Caveat: if a skill is in *every* journey's path (e.g. + `board_layout`), there are **zero controls → no measured noise floor** — say so and + treat marginal deltas as unresolved, not wins. +- **Don't game the metrics.** Two burned lessons: (1) tightening `graph_process_spine` + cut variance but **regressed** OOD quality (patient-triage 0.367→0.203) — *variance-down + ≠ quality-up*; (2) a viz-protective re-weight made GEPA change **nothing**, proving the + junior/viz tension is fundamental for one universal `board_layout`. If a change only + moves the number without moving real readability, revert it. 
+ +## Running a real job (background, rate-limit-safe) + +Detached launch template — the script must be the **FIRST** command in the heredoc, no +leading `pkill` (a nonzero exit aborts the whole chain): + +```bash +cat > "$CLAUDE_JOB_DIR/tmp/run_x.sh" <<'EOF' +#!/usr/bin/env bash +cd /home/ivanmkc/termchart/.claude/worktrees/gepa-flowchart/scripts/experiments/gepa-flowchart +source ~/.profile 2>/dev/null +export CLAUDE_CODE_USE_VERTEX=1 ANTHROPIC_VERTEX_PROJECT_ID=adk-coding-agents CLOUD_ML_REGION=global PYTHONDONTWRITEBYTECODE=1 +export GEPA_JUNIOR_RUBRIC=1 GEPA_VIZ_USAGE=1 GEPA_CHECK_LINKS=1 +export GEPA_JUDGE_PANEL=gemini-2.5-flash GEPA_GEN_SAMPLES=2 # lean optimize config under rate limits +export GEPA_RENDER_PORT=8961 GEPA_VIEWER_PORT=8960 # ISOLATE ports per concurrent run +exec python3 -u -m gepa_flowchart.multi_journey_run --group graph --run-dir overnight/graph1 +EOF +chmod +x "$CLAUDE_JOB_DIR/tmp/run_x.sh" +setsid "$CLAUDE_JOB_DIR/tmp/run_x.sh" >"$CLAUDE_JOB_DIR/tmp/run_x.log" 2>&1 & +``` + +**Detect a live run** (the naive proc grep false-matches watcher loops): +```bash +ps -e -o cmd | grep -E "python3 .*gepa_flowchart\.(ood_overexplain|multi_journey_run|journey_run|run)" | grep -v grep +``` + +## Gotchas (the expensive-to-rediscover list) + +- **`gepa_state.bin` is the source of truth**, not the printed log or `report.md`. If a + run dies, `recover` / `_recover_best_from_state` rebuilds the best candidate from it. +- **Self-repair loop is net-negative** — keep `GEPA_SELF_REPAIR` / `GEPA_REPAIR` off. + It adds a fix pass that costs more than it recovers. +- **Component vocab must match the viewer exactly.** Skills must only emit real Mantine + component names the renderer knows (e.g. there is **no `Stat`** component). An invented + name renders an error block and craters the board. +- **Isolate render/viewer ports per concurrent run** (`GEPA_RENDER_PORT`/`GEPA_VIEWER_PORT`; + e.g. 8899/8898, 8961/8960). 
Two runs on the same port corrupt each other's renders. +- **Vertex 429 RESOURCE_EXHAUSTED** is handled by `llm._retry` (exp backoff 5s→120s, + 7 attempts) then graceful empty-degrade. Under sustained limits, run the **lean config**: + single `gemini-2.5-flash` optimize judge + `K=2`. +- **`gemini-2.5-pro` returns all-zero rubric** (thinking eats the token budget) — keep it + OUT of the gate panel; the abstain logic tolerates a flaky judge but a silently-zero one + poisons the median. +- **Schema is where atoms live.** The er-diagram crater was a *schema gap* + (`_FLOW_SCHEMA` never documented `type:"entity"` + `fields[]`), not a prompt problem — + fix data/schema at the source, not with more prose. +- Ported wins ship via the **plugin** (`plugin/skills/diagram-recipes/SKILL.md` + + version bump), independent of the viewer image and npm CLI. GEPA validates a direction; + porting is a separate, guarded step (keep the "don't crowd visuals" space-budget guard). + +## Where things are + +- Metric: `unified_metric.py` (`score_unified`, `score_sampled`, `check_links`, + PoLL panel, junior/viz axes). Config + all `GEPA_*` env parsing: `config.py`. +- Data/questions: `dataset.py` (`freeze_questions`, junior rubric). Journeys: + `journeys.py` + `journeys_catalog.json`; topology skills: `skill_library.py` + (`SKILL_TEXT`, `_SEED_TEXT`, `topology_path`). +- Adapters: `journey_adapter.py`, `hierarchical_adapter.py`, `type_adapter.py`. +- Render bridge: `type_render.py` (`TypedRenderService`), `geometry_bridge.py`. +- LLM + retry: `llm.py`. Holdout set: `holdout_journeys.json`. +- Results write-ups: `overnight/AUTONOMOUS_SUMMARY.md` + `overnight/SUMMARY_*.md`. 
diff --git a/scripts/experiments/gepa-flowchart/overnight/SUMMARY.md b/scripts/experiments/gepa-flowchart/overnight/SUMMARY.md new file mode 100644 index 0000000..330f0fa --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/SUMMARY.md @@ -0,0 +1,10 @@ +# Overnight experiments — held-out comparison + +Canonical metric (WHM, weights 0.5/0.3/0.2), held-out topics (12, unseen during optimization): onboarding, state-machine, k8s-deploy, signup-funnel, saga-compensation, oauth-pkce, raft-election, k8s-scheduling, tcp-lifecycle, payment-3ds, blue-green, rate-limiter + +| candidate | score | comprehension | geometry | visual_quality | +|---|---|---|---|---| +| whm_hybrid_opus | **0.588** | 0.77 | 0.47 | 0.53 | +| linear_instance | **0.539** | 0.73 | 0.44 | 0.46 | +| whm_hybrid_sonnet | **0.490** | 0.66 | 0.39 | 0.48 | +| seed | **0.411** | 0.49 | 0.35 | 0.50 | diff --git a/scripts/experiments/gepa-flowchart/overnight/SUMMARY_ab_sgcr.json b/scripts/experiments/gepa-flowchart/overnight/SUMMARY_ab_sgcr.json new file mode 100644 index 0000000..ecbfa0c --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/SUMMARY_ab_sgcr.json @@ -0,0 +1,178 @@ +[ + { + "id": "raft-election", + "both_ok": true, + "had_groups": true, + "nnodes": 14, + "dagre_geometry": 0.4, + "dagre_visual_quality": 1.0, + "dagre_comprehension": 0.5714, + "dagre_overlaps": 0, + "dagre_offscreen": 3, + "dagre_min_font": 10.2, + "dagre_total": 0.5479, + "dagre_ok": true, + "sgcr_geometry": 1.0, + "sgcr_visual_quality": 1.0, + "sgcr_comprehension": 0.7857, + "sgcr_overlaps": 0, + "sgcr_offscreen": 0, + "sgcr_min_font": 10.2, + "sgcr_total": 0.88, + "sgcr_ok": true + }, + { + "id": "oauth-pkce", + "both_ok": true, + "had_groups": true, + "nnodes": 8, + "dagre_geometry": 0.2, + "dagre_visual_quality": 0.65, + "dagre_comprehension": 0.3571, + "dagre_overlaps": 0, + "dagre_offscreen": 4, + "dagre_min_font": 10.2, + "dagre_total": 0.3118, + "dagre_ok": true, + "sgcr_geometry": 0.4, + 
"sgcr_visual_quality": 0.65, + "sgcr_comprehension": 0.5, + "sgcr_overlaps": 0, + "sgcr_offscreen": 3, + "sgcr_min_font": 10.2, + "sgcr_total": 0.486, + "sgcr_ok": true + }, + { + "id": "k8s-deploy", + "both_ok": true, + "had_groups": true, + "nnodes": 14, + "dagre_geometry": 0.4, + "dagre_visual_quality": 0.775, + "dagre_comprehension": 0.6429, + "dagre_overlaps": 0, + "dagre_offscreen": 3, + "dagre_min_font": 10.2, + "dagre_total": 0.56, + "dagre_ok": true, + "sgcr_geometry": 1.0, + "sgcr_visual_quality": 0.875, + "sgcr_comprehension": 0.7857, + "sgcr_overlaps": 0, + "sgcr_offscreen": 0, + "sgcr_min_font": 10.2, + "sgcr_total": 0.8584, + "sgcr_ok": true + }, + { + "id": "payment-3ds", + "both_ok": true, + "had_groups": true, + "nnodes": 14, + "dagre_geometry": 1.0, + "dagre_visual_quality": 1.0, + "dagre_comprehension": 0.7857, + "dagre_overlaps": 0, + "dagre_offscreen": 0, + "dagre_min_font": 10.2, + "dagre_total": 0.88, + "dagre_ok": true, + "sgcr_geometry": 1.0, + "sgcr_visual_quality": 1.0, + "sgcr_comprehension": 0.8571, + "sgcr_overlaps": 0, + "sgcr_offscreen": 0, + "sgcr_min_font": 10.2, + "sgcr_total": 0.9231, + "sgcr_ok": true + }, + { + "id": "saga-compensation", + "both_ok": true, + "had_groups": true, + "nnodes": 13, + "dagre_geometry": 0.2, + "dagre_visual_quality": 0.875, + "dagre_comprehension": 0.7143, + "dagre_overlaps": 0, + "dagre_offscreen": 4, + "dagre_min_font": 10.2, + "dagre_total": 0.4118, + "dagre_ok": true, + "sgcr_geometry": 0.0, + "sgcr_visual_quality": 0.25, + "sgcr_comprehension": 0.0, + "sgcr_overlaps": 0, + "sgcr_offscreen": 13, + "sgcr_min_font": 10.2, + "sgcr_total": 0.0595, + "sgcr_ok": true + }, + { + "id": "state-machine", + "both_ok": true, + "had_groups": true, + "nnodes": 12, + "dagre_geometry": 0.0, + "dagre_visual_quality": 0.65, + "dagre_comprehension": 0.5, + "dagre_overlaps": 0, + "dagre_offscreen": 5, + "dagre_min_font": 10.2, + "dagre_total": 0.1368, + "dagre_ok": true, + "sgcr_geometry": 1.0, + 
"sgcr_visual_quality": 1.0, + "sgcr_comprehension": 1.0, + "sgcr_overlaps": 0, + "sgcr_offscreen": 0, + "sgcr_min_font": 10.2, + "sgcr_total": 1.0, + "sgcr_ok": true + }, + { + "id": "tcp-lifecycle", + "both_ok": true, + "had_groups": true, + "nnodes": 15, + "dagre_geometry": 0.0, + "dagre_visual_quality": 0.775, + "dagre_comprehension": 0.5714, + "dagre_overlaps": 0, + "dagre_offscreen": 5, + "dagre_min_font": 10.2, + "dagre_total": 0.1402, + "dagre_ok": true, + "sgcr_geometry": 1.0, + "sgcr_visual_quality": 0.875, + "sgcr_comprehension": 0.7143, + "sgcr_overlaps": 0, + "sgcr_offscreen": 0, + "sgcr_min_font": 10.2, + "sgcr_total": 0.814, + "sgcr_ok": true + }, + { + "id": "blue-green", + "both_ok": true, + "had_groups": true, + "nnodes": 14, + "dagre_geometry": 0.2, + "dagre_visual_quality": 0.875, + "dagre_comprehension": 0.8571, + "dagre_overlaps": 0, + "dagre_offscreen": 4, + "dagre_min_font": 10.2, + "dagre_total": 0.4325, + "dagre_ok": true, + "sgcr_geometry": 0.0, + "sgcr_visual_quality": 0.775, + "sgcr_comprehension": 1.0, + "sgcr_overlaps": 0, + "sgcr_offscreen": 8, + "sgcr_min_font": 10.2, + "sgcr_total": 0.148, + "sgcr_ok": true + } +] \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/SUMMARY_geomvq.md b/scripts/experiments/gepa-flowchart/overnight/SUMMARY_geomvq.md new file mode 100644 index 0000000..bbafe71 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/SUMMARY_geomvq.md @@ -0,0 +1,10 @@ +# Held-out comparison + +Score = WHM over **geometry+visual_quality** (weights comp 0.5/geom 0.3/vq 0.2, comp floor 0.65). All three axes shown for reference. 
Held-out topics (12, unseen during optimization): onboarding, state-machine, k8s-deploy, signup-funnel, saga-compensation, oauth-pkce, raft-election, k8s-scheduling, tcp-lifecycle, payment-3ds, blue-green, rate-limiter + +| candidate | score | comprehension | geometry | visual_quality | +|---|---|---|---|---| +| geomvq_balanced | **0.613** | 0.68 | 0.56 | 0.90 | +| geomvq_strict | **0.596** | 0.82 | 0.50 | 0.85 | +| seed | **0.573** | 0.78 | 0.49 | 0.87 | +| geomvq_geomheavy | **0.529** | 0.77 | 0.44 | 0.82 | diff --git a/scripts/experiments/gepa-flowchart/overnight/SUMMARY_journeys.md b/scripts/experiments/gepa-flowchart/overnight/SUMMARY_journeys.md new file mode 100644 index 0000000..6459c6b --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/SUMMARY_journeys.md @@ -0,0 +1,102 @@ +# Journey optimization — seed → best (all journeys, from checkpoint) + +- **agent-trace**: Seed 0.6145 → Best 0.7571 (+0.1426) +- **agent-trace.log**: NO STATE +- **algorithm-walkthrough**: Seed 0.5258 → Best 0.6278 (+0.1020) +- **algorithm-walkthrough.log**: NO STATE +- **architecture-zones**: Seed 0.4002 → Best 0.4002 (+0.0000) +- **architecture-zones.log**: NO STATE +- **before-after**: Seed 0.7198 → Best 0.8756 (+0.1557) +- **before-after.log**: NO STATE +- **build-pipeline**: Seed 0.5642 → Best 0.6944 (+0.1302) +- **build-pipeline.log**: NO STATE +- **c4-architecture**: Seed 0.1444 → Best 0.1744 (+0.0300) +- **c4-architecture.log**: NO STATE +- **calendar-heatmap**: Seed 0.3231 → Best 0.5729 (+0.2498) +- **calendar-heatmap.log**: NO STATE +- **call-hierarchy**: Seed 0.7167 → Best 0.7167 (+0.0000) +- **call-hierarchy.log**: NO STATE +- **class-diagram**: Seed 0.6298 → Best 0.7146 (+0.0848) +- **class-diagram.log**: NO STATE +- **complexity-growth**: Seed 0.3905 → Best 0.3905 (+0.0000) +- **complexity-growth.log**: NO STATE +- **correlation-heatmap**: Seed 0.8379 → Best 0.8379 (+0.0000) +- **correlation-heatmap.log**: NO STATE +- **critical-path**: Seed 0.2059 → Best 
0.2059 (+0.0000) +- **critical-path.log**: NO STATE +- **data-lineage**: Seed 0.2054 → Best 0.3386 (+0.1332) +- **data-lineage.log**: NO STATE +- **debug-snapshot**: Seed 0.6360 → Best 0.6498 (+0.0138) +- **debug-snapshot.log**: NO STATE +- **diy-project-plan**: Seed 0.3605 → Best 0.4569 (+0.0964) +- **diy-project-plan.log**: NO STATE +- **er-diagram**: Seed 0.4758 → Best 0.5143 (+0.0385) +- **er-diagram.log**: NO STATE +- **event-driven**: Seed 0.4616 → Best 0.5363 (+0.0747) +- **event-driven.log**: NO STATE +- **explainer**: Seed 0.4588 → Best 0.4588 (+0.0000) +- **explainer.log**: NO STATE +- **flame-graph**: Seed 0.7178 → Best 0.7178 (+0.0000) +- **flame-graph.log**: NO STATE +- **flight-comparison**: Seed 0.8681 → Best 0.9226 (+0.0545) +- **flight-comparison.log**: NO STATE +- **gantt**: Seed 0.3085 → Best 0.4435 (+0.1350) +- **gantt.log**: NO STATE +- **growth-funnel**: Seed 0.4108 → Best 0.4975 (+0.0867) +- **growth-funnel.log**: NO STATE +- **k8s-topology**: Seed 0.3541 → Best 0.4394 (+0.0853) +- **k8s-topology.log**: NO STATE +- **map-routes**: Seed 0.2908 → Best 0.4962 (+0.2054) +- **map-routes.log**: NO STATE +- **metrics-dashboard**: Seed 0.6848 → Best 0.7712 (+0.0865) +- **metrics-dashboard.log**: NO STATE +- **observability-dashboard**: Seed 0.3656 → Best 0.7856 (+0.4200) +- **observability-dashboard.log**: NO STATE +- **okr-tree**: Seed 0.4068 → Best 0.4567 (+0.0499) +- **okr-tree.log**: NO STATE +- **pr-review**: Seed 0.4987 → Best 0.4987 (+0.0000) +- **pr-review-summary**: Seed 0.5800 → Best 0.6134 (+0.0333) +- **pr-review-summary.log**: NO STATE +- **pr-review.log**: NO STATE +- **product-comparison**: Seed 0.8179 → Best 0.8179 (+0.0000) +- **product-comparison.log**: NO STATE +- **query-plan**: Seed 0.5126 → Best 0.5613 (+0.0487) +- **query-plan.log**: NO STATE +- **raci-matrix**: Seed 0.7144 → Best 0.8222 (+0.1079) +- **raci-matrix.log**: NO STATE +- **recipe-display**: Seed 0.4915 → Best 0.5697 (+0.0781) +- **recipe-display.log**: NO STATE +- 
**recursion-tree**: Seed 0.5360 → Best 0.6896 (+0.1536) +- **recursion-tree.log**: NO STATE +- **risk-matrix**: Seed 0.6101 → Best 0.8474 (+0.2373) +- **risk-matrix.log**: NO STATE +- **roadmap**: Seed 0.6590 → Best 0.6675 (+0.0085) +- **roadmap.log**: NO STATE +- **security-threat-model**: Seed 0.1452 → Best 0.1474 (+0.0023) +- **security-threat-model.log**: NO STATE +- **sequence-diagram**: Seed 0.2548 → Best 0.5056 (+0.2508) +- **sequence-diagram.log**: NO STATE +- **service-health**: Seed 0.5727 → Best 0.5727 (+0.0000) +- **service-health.log**: NO STATE +- **shopping-comparison**: Seed 0.8007 → Best 0.8007 (+0.0000) +- **shopping-comparison.log**: NO STATE +- **sprint-burndown**: Seed 0.6448 → Best 0.6448 (+0.0000) +- **sprint-burndown.log**: NO STATE +- **sre-incident**: Seed 0.8680 → Best 0.9531 (+0.0851) +- **sre-incident.log**: NO STATE +- **stacktrace**: Seed 0.7551 → Best 0.8537 (+0.0985) +- **stacktrace.log**: NO STATE +- **state-machine**: Seed 0.2416 → Best 0.6187 (+0.3771) +- **state-machine.log**: NO STATE +- **swimlane-process**: Seed 0.1328 → Best 0.2273 (+0.0945) +- **swimlane-process.log**: NO STATE +- **task-checklist**: Seed 0.6107 → Best 0.7281 (+0.1174) +- **task-checklist.log**: NO STATE +- **task-progress**: Seed 0.5791 → Best 0.6727 (+0.0937) +- **task-progress.log**: NO STATE +- **task-tracker**: Seed 0.6364 → Best 0.6364 (+0.0000) +- **task-tracker.log**: NO STATE +- **trace-waterfall**: Seed 0.6863 → Best 0.6863 (+0.0000) +- **trace-waterfall.log**: NO STATE +- **user-journey**: Seed 0.3634 → Best 0.5671 (+0.2037) +- **user-journey.log**: NO STATE diff --git a/scripts/experiments/gepa-flowchart/overnight/SUMMARY_phase2.md b/scripts/experiments/gepa-flowchart/overnight/SUMMARY_phase2.md new file mode 100644 index 0000000..7a7c078 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/SUMMARY_phase2.md @@ -0,0 +1,6 @@ +# Phase 2 — panes-fixed + journeys (seed → best, from checkpoint) + +- **panes-fixed**: Seed 0.0000 → Best 
0.0000 (+0.0000) +- **shopping-comparison**: Seed 0.8786 → Best 0.9000 (+0.0214) +- **recipe-display**: Seed 0.5429 → Best 1.0000 (+0.4571) +- **flight-comparison**: Seed 0.9000 → Best 1.0000 (+0.1000) diff --git a/scripts/experiments/gepa-flowchart/overnight/SUMMARY_repair.md b/scripts/experiments/gepa-flowchart/overnight/SUMMARY_repair.md new file mode 100644 index 0000000..638e1d5 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/SUMMARY_repair.md @@ -0,0 +1,9 @@ +# Held-out comparison + +Score = WHM over **comprehension+geometry+visual_quality** (weights comp 0.5/geom 0.3/vq 0.2). All three axes shown for reference. Held-out topics (12, unseen during optimization): onboarding, state-machine, k8s-deploy, signup-funnel, saga-compensation, oauth-pkce, raft-election, k8s-scheduling, tcp-lifecycle, payment-3ds, blue-green, rate-limiter + +| candidate | score | comprehension | geometry | visual_quality | +|---|---|---|---|---| +| repair_loop | **0.626** | 0.71 | 0.49 | 0.77 | +| repair_floor | **0.608** | 0.66 | 0.54 | 0.79 | +| seed | **0.605** | 0.69 | 0.48 | 0.74 | diff --git a/scripts/experiments/gepa-flowchart/overnight/SUMMARY_types.md b/scripts/experiments/gepa-flowchart/overnight/SUMMARY_types.md new file mode 100644 index 0000000..8a8feb6 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/SUMMARY_types.md @@ -0,0 +1,6 @@ +# Non-flow type optimization — seed → best (from checkpoint) + +- **calltree**: **Seed 0.7029 → Best 0.7553 (+0.0525)** +- **vegalite**: **Seed 0.3871 → Best 0.4322 (+0.0452)** +- **component**: **Seed 0.4363 → Best 0.4840 (+0.0476)** +- **panes**: **Seed 0.0000 → Best 0.0000 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/combined2/report.md b/scripts/experiments/gepa-flowchart/overnight/combined2/report.md new file mode 100644 index 0000000..e06a47f --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/combined2/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: 
er-diagram, class-diagram, state-machine, build-pipeline, shopping-comparison, product-comparison, metrics-dashboard, observability-dashboard, recipe, explainer, app-screen-mockup, training-curves +shared skills: artifact_note, board_layout, chart_internal, comparison_grid, dashboard_grid, graph_entity_lanes, graph_process_spine, report_rows, screen_frame +shared skills CHANGED by GEPA: board_layout + +**Seed 0.5019 -> Best 0.6658 (+0.1639)** diff --git a/scripts/experiments/gepa-flowchart/overnight/combined_overexplain/report.md b/scripts/experiments/gepa-flowchart/overnight/combined_overexplain/report.md new file mode 100644 index 0000000..5a226a3 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/combined_overexplain/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: k8s-topology, cloud-architecture, product-comparison, shopping-comparison, recipe, metrics-dashboard +shared skills: artifact_note, board_layout, comparison_grid, dashboard_grid, graph_zoned_tiers, report_rows +shared skills CHANGED by GEPA: board_layout + +**Seed 0.4546 -> Best 0.5489 (+0.0943)** diff --git a/scripts/experiments/gepa-flowchart/overnight/everyday_journey_home-budget/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_home-budget/best_prompts.json new file mode 100644 index 0000000..a6ab298 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_home-budget/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'home-budget' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- A chart of spending by category\n- Income vs total spending (or savings) is shown\n- The money axis has a title and currency units\n- Categories sorted by amount or shown as proportions\n- Category values or percentages are labeled\n- The largest category is highlighted\n- Title names the household and the 
month\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/everyday_journey_home-budget/report.md b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_home-budget/report.md new file mode 100644 index 0000000..f12b883 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_home-budget/report.md @@ -0,0 +1,5 @@ +# GEPA journey: home-budget + +rubric criteria: 7; val: 2 + +**Seed 0.6179 → Best 0.6179 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/everyday_journey_meal-plan/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_meal-plan/best_prompts.json new file mode 100644 index 0000000..76de317 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_meal-plan/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'meal-plan' board as concise plain text.\n\nINPUTS\n- Use case: {topic}\n- Reader: {audience}\n- Goal: {purpose}\n\nTASK\nProduce a meal-plan board that names SPECIFIC, real content (actual dish names, real\nquantities, concrete labels). Generic placeholders fail. The board is rendered in a\nLIMITED-SIZE space and scored on comprehension, visual quality, geometry (no overflow/\nclipping), and a strict rubric. You MUST satisfy EVERY rubric criterion below AND keep\nthe output compact so nothing overflows or clips.\n\nRUBRIC \u2014 every item is mandatory:\n1. Title names the plan/week AND the diet (e.g. 
\"Vegetarian Meal Plan \u2014 Week of Jun 9\u201315 (Lacto-Ovo)\").\n2. A grid of 7 days \u00d7 3 meals (breakfast / lunch / dinner) where every slot names a SPECIFIC dish.\n3. A consolidated grocery list with a checkbox for EACH item (see strict format below).\n4. Variety + nutritional balance across the week, stated explicitly in a short notes line.\n5. Prep and/or leftover notes are included.\n6. Dietary constraints are reflected or tagged (use short inline tags).\n7. Concrete real values throughout (quantities on groceries, specific dish names).\n\nCRITICAL FORMAT RULES (these fix the recurring scoring failures):\n\nA. GROCERY LIST \u2014 this is the most failure-prone criterion (it scored 0.0 repeatedly).\n - Put ONE item per line, each starting with \"- [ ] \".\n - NEVER combine multiple foods on one checkbox line. Do NOT use \"\u00b7\", \"/\", or commas\n to pack several items behind a single checkbox. \"- [ ] Quinoa \u00b7 Rice \u00b7 Oats\" is WRONG.\n Write three lines instead: \"- [ ] Quinoa\", \"- [ ] Brown rice\", \"- [ ] Oats\".\n - Include a quantity for each item (e.g. \"- [ ] Eggs (18)\", \"- [ ] Salmon fillets (1.5 lb)\").\n - Group under plain headers (Produce, Protein, Dairy, Pantry) but keep each item on its\n own checkbox line under the header.\n - Keep the list to roughly 18\u201328 lines so it fits; consolidate duplicate ingredients.\n\nB. GEOMETRY \u2014 content overflowed/clipped in every prior attempt. Stay compact:\n - Do NOT use wide markdown tables; they overflow. Render the day grid as one block per\n day with short dish names, e.g.:\n Mon \u2014 B: Greek yogurt parfait | L: Chickpea spinach wrap | D: Lentil bolognese [veg]\n Tue \u2014 B: ... | L: ... | D: ...\n - Keep each dish name short (2\u20135 words). No long descriptions inside slots.\n - Keep balance notes and prep notes to a few short bullet lines, not paragraphs.\n - Avoid decorative separators, emoji walls, and redundant sections. 
Total output should\n be tight enough to render without scrolling/clipping.\n\nC. CLARITY \u2014 use clear short section headers in this order:\n 1) Title line\n 2) GRID (7 day-lines as above)\n 3) BALANCE: one line summarizing protein rotation, whole grains, veg/day, variety\n 4) PREP/LEFTOVERS: 3\u20135 short bullets\n 5) GROCERY LIST: header-grouped, one checkbox per item\n\nDIETARY TAGS\n- Use short inline tags in dinner/meal slots, e.g. [veg], [fish], [leftover], [vegan-swap].\n- Add one line noting how dietary constraints from the use case are met (and any swaps).\n\nQUALITY CHECKLIST before finishing:\n- Every grid slot has a specific dish.\n- Each grocery item is a separate \"- [ ] \" line with a quantity.\n- No table, no overlong lines, nothing that would overflow.\n- Title includes week + diet.\n- Prep/leftover notes present; balance stated; constraints tagged.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/everyday_journey_meal-plan/report.md b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_meal-plan/report.md new file mode 100644 index 0000000..5057dbd --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_meal-plan/report.md @@ -0,0 +1,5 @@ +# GEPA journey: meal-plan + +rubric criteria: 7; val: 2 + +**Seed 0.4528 → Best 0.5279 (+0.0751)** diff --git a/scripts/experiments/gepa-flowchart/overnight/everyday_journey_recipe/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_recipe/best_prompts.json new file mode 100644 index 0000000..d2c533a --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_recipe/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a single physical 'recipe' board (a fixed-size poster/card).\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are planning ONE fixed-size physical recipe board. It is judged on comprehension, visual quality, geometry (content MUST fit without overflowing or clipping), and a strict rubric. Aim to satisfy EVERY rubric criterion with SPECIFIC, real, concrete content (generic plans fail).\n\nRUBRIC CRITERIA \u2014 name concrete content for each, and LABEL each element inline with its rubric number (e.g. \"[Rubric 1: TITLE]\"):\n1. TITLE that names the specific real dish; include the local/authentic name in parentheses if relevant (e.g. \"Spaghetti Carbonara (Spaghetti alla Carbonara)\"). Add a short subtitle tying to the reader/goal.\n2. VISUAL/PHOTO \u2014 ONE concrete hero photo of the finished dish as its own dedicated, clearly-labeled image element (\"HERO PHOTO\"). Describe what's in the bowl/plate, garnish, plating, props, lighting. Never omit or bury it.\n3. INGREDIENTS LIST with exact quantities in BOTH metric and US units where helpful (e.g. 
\"400 g (14 oz) spaghetti\", \"\u2153 cup (75 g) melted butter\").\n4. NUMBERED PREPARATION STEPS in correct order, using real techniques, temps (\u00b0C and \u00b0F), and times.\n5. PREP TIME + COOK TIME + TOTAL TIME + SERVINGS, all with real values.\n6. DIFFICULTY / SKILL LEVEL stated explicitly (e.g. \"Beginner / Easy\", \"Intermediate\").\n7. BOTH a per-serving nutrition line (calories, protein, carbs, fat) AND 2\u20133 specific chef tips. Include both; do not drop one.\n\nCRITICAL \u2014 GEOMETRY / FIT THE BOARD (this fails most often when content is too long):\n- Ingredients: cap at ~9 lines (do NOT exceed ~9\u201310; 11+ lines overflows). Group only if needed; no long sub-sections.\n- Steps: cap at ~8 numbered steps (max 9); keep each to ONE short line.\n- Chef tips: max 3 short bullets.\n- Nutrition: a single line, not a block.\n- Quick-facts: one tight horizontal strip/row of badges for prep/cook/total/servings/difficulty.\n- Balance the two columns: ingredient count \u2248 step count so neither column overflows.\n\nLAYOUT \u2014 state an explicit, realistic fixed layout in clear top-to-bottom zones that physically fits:\n- ZONE 1 \u2014 TITLE BAR (top, full width): title + short subtitle.\n- ZONE 2 \u2014 HERO PHOTO (full-width banner below title).\n- ZONE 3 \u2014 QUICK-FACTS STRIP (one slim horizontal row of badges: prep / cook / total / serves / difficulty).\n- ZONE 4 \u2014 TWO COLUMNS (main body): INGREDIENTS left | STEPS right.\n- ZONE 5 \u2014 BOTTOM STRIP (slim, full width): nutrition single line + 3 chef tips.\nSpecify a concrete board size and orientation (e.g. 
\"A2 portrait poster\" or \"18\"\u00d724\" vertical\"), and approximate vertical proportions for each zone.\n\nCOMPREHENSION \u2014 to maximize clarity and scoring:\n- Use real, authentic, specific values throughout (real dish name, real measurements, real temps/times, real per-serving nutrition).\n- Label every element inline with its rubric number.\n- End with a brief FIT CHECK confirming counts (e.g. \"9 ingredient lines vs 8 step lines, balanced; 1 nutrition line + 3 tips; no sub-sections, no overflow\") and zone proportions.\n\nOutput a concise plain-text plan. Be concrete with real values; keep it short enough to fit the board.", + "generate": "You generate a single valid JSON object describing a Mantine-style UI component tree that renders a visual \"board\" (e.g., a recipe poster/card). The JSON is rendered into a fixed-size layout and scored on: composition/use of space (comp), visual quality (vq), geometry (must NOT overflow or clip), and rubric coverage.\n\n{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\n## OUTPUT FORMAT\n- Output ONLY the JSON object. No prose, no markdown fences, no comments.\n- The root must be a valid component node of the declared type. Each node is:\n {\"type\": \"<ComponentName>\", \"props\": {...}, \"children\": [...] or \"<text>\"}\n- Use real, concrete values from the plan. Never use placeholders like \"TBD\", \"...\", or lorem text.\n- Escape inner quotes properly (e.g., 9\u00d75\\\" loaf pan). Ensure the JSON parses.\n\n## AVAILABLE COMPONENTS (use these)\nCard, Stack, Group, Grid, Grid.Col, SimpleGrid, Title, Text, Image, Badge, Divider, List, List.Item.\n- Title uses props.order (1=largest). Text uses size (xs/sm/md/lg) and c (e.g., \"dimmed\"), fw for weight.\n- List uses props.type:\"ordered\" for numbered steps; List.Item children are strings.\n- Grid.Col uses props.span (out of 12). 
SimpleGrid uses props.cols.\n- Image needs src, alt, and a height (h).\n\n## TASK REQUIREMENTS (bake in the SPECIFIC values the plan names)\nSatisfy EVERY rubric criterion explicitly. For recipe boards these are:\n1. Title (and subtitle if planned).\n2. A hero/visual image element with descriptive alt text.\n3. Ingredients list (exact lines from the plan).\n4. Numbered preparation steps (ordered list, exact steps).\n5. Quick-facts (prep/cook/total time, servings) as badges.\n6. Difficulty, included as a distinct labeled badge.\n7. Nutrition line AND chef tips \u2014 BOTH must be present and complete.\n\n## CRITICAL FIXES (these were the recurring failures \u2014 do not repeat them)\n\n### 1. GEOMETRY: NEVER OVERFLOW OR CLIP (top priority)\nContent overflowed/clipped in multiple cases. The board has fixed dimensions, so be compact:\n- Keep the whole tree tight. Use small gaps (gap: \"xs\"/\"sm\" or numeric 4\u20138) and small padding (\"sm\"/\"md\", not \"lg\").\n- Use size \"sm\" for List items and body Text; reserve \"lg\" only for the single nutrition value.\n- Put long lists (ingredients/steps) side-by-side in two columns (SimpleGrid cols:2 or Grid with span 6/6) to reduce height.\n- Keep the hero Image height moderate (h \u2248 140\u2013220), not 260\u2013300, so it does not push content past the boundary.\n- Do not wrap every section in its own bordered Card stacked vertically \u2014 nested/stacked cards add height and cause clipping. Prefer Dividers between sections within one container, or compact cards.\n- Mentally do a fit check: title + image + one badge strip + two-column body + one bottom strip should fit without scrolling.\n\n### 2. 
IMAGE / VISUAL QUALITY (image_or_visual was scored low at 0.5)\nThe hero image must look like a real, relevant photo \u2014 not a gray placeholder:\n- Prefer a realistic photo URL (e.g., an Unsplash-style image URL) over placehold.co text placeholders, which score poorly on visual quality.\n- Always include rich, specific alt text describing the finished dish.\n- Include the planned caption as a small dimmed Text directly under the image.\n- Make the image visually prominent (top band), but not so tall it overflows.\n\n### 3. NUTRITION + TIPS (nutrition_or_tips was scored as low as 0.0)\nBoth nutrition AND chef tips must be clearly present and fully populated:\n- Nutrition: show the complete line with all values from the plan (e.g., \"215 kcal \u00b7 3 g protein \u00b7 34 g carbs \u00b7 8 g fat\"), labeled \"Nutrition (per serving/slice)\".\n- Render nutrition as readable text/values, not as cryptic standalone badges that can be mistaken for facts.\n- Chef Tips: a clearly labeled heading plus the FULL list of tips from the plan (all of them, verbatim in meaning).\n- Do not drop, truncate, or summarize tips. Missing or incomplete tips is the single biggest scoring loss here.\n\n### 4. COMPOSITION / USE OF SPACE (comp was ~0.64\u20130.79)\n- Fill the board's width: use full-width title band, a badge strip that uses the row, and balanced two-column body.\n- Vary color across badges meaningfully (e.g., time badges one family, servings another, difficulty a standout color).\n- Maintain clear visual hierarchy: Title (order 1) > section headings (order 3/4) > body text.\n- Use Dividers to separate the title band, facts strip, body, and bottom strip for a clean poster look.\n\n## PROCESS\n1. Map each plan zone to rubric criteria; confirm all 7 are covered.\n2. Choose a compact two-column body layout to control height.\n3. Use a realistic image URL + descriptive alt + caption.\n4. Ensure nutrition line and ALL chef tips are fully written out.\n5. 
Re-check the geometry fit (compact sizes/gaps) before finalizing.\n6. Output ONLY the valid JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/everyday_journey_recipe/report.md b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_recipe/report.md new file mode 100644 index 0000000..8a892c7 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_recipe/report.md @@ -0,0 +1,5 @@ +# GEPA journey: recipe + +rubric criteria: 7; val: 2 + +**Seed 0.4607 → Best 0.9329 (+0.4722)** diff --git a/scripts/experiments/gepa-flowchart/overnight/everyday_journey_travel-itinerary/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_travel-itinerary/best_prompts.json new file mode 100644 index 0000000..611e92c --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_travel-itinerary/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'travel-itinerary' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Each day of the trip is its own clearly labeled section/pane\n- Activities have times/order within each day\n- A map pane shows the destinations or route\n- A budget/cost element (chart or table) is included\n- Transport and accommodation logistics are noted\n- Must-see highlights are flagged\n- Title names the trip (destination + duration)\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/everyday_journey_travel-itinerary/report.md b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_travel-itinerary/report.md new file mode 100644 index 0000000..1da1bc3 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_travel-itinerary/report.md @@ -0,0 +1,5 @@ +# GEPA journey: travel-itinerary + +rubric criteria: 7; val: 2 + +**Seed 0.4479 → Best 0.4479 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/everyday_journey_trip-comparison/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_trip-comparison/best_prompts.json new file mode 100644 index 0000000..3ba3208 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_trip-comparison/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'trip-comparison' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Each destination/option is its own card\n- Each option shows a representative image\n- Each option shows an estimated cost\n- Weather or best-season info per option\n- Key highlights/activities per option\n- A recommended/best-value option is flagged\n- Each option has an outbound link to book or learn more\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/everyday_journey_trip-comparison/report.md b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_trip-comparison/report.md new file mode 100644 index 0000000..0417cfb --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_trip-comparison/report.md @@ -0,0 +1,5 @@ +# GEPA journey: trip-comparison + +rubric criteria: 7; val: 2 + +**Seed 0.4100 → Best 0.4100 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/everyday_journey_weekend-plan/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_weekend-plan/best_prompts.json new file mode 100644 index 0000000..d756119 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_weekend-plan/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'weekend-plan' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Saturday and Sunday are clearly separated\n- Activities are placed in time blocks (morning/afternoon/evening or hours)\n- A mix of activity types (chores, leisure, social) is shown\n- A checklist of to-dos with checkboxes is included\n- Priorities or must-dos are flagged\n- Realistic buffers, meals and rest are included\n- Title names the weekend\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/everyday_journey_weekend-plan/report.md b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_weekend-plan/report.md new file mode 100644 index 0000000..b83635b --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/everyday_journey_weekend-plan/report.md @@ -0,0 +1,5 @@ +# GEPA journey: weekend-plan + +rubric criteria: 7; val: 2 + +**Seed 0.3165 → Best 0.3165 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-component/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-component/best_prompts.json new file mode 100644 index 0000000..8b4e622 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-component/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'generic-component' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Information is organized with a clear visual hierarchy (headings/sections)\n- Uses appropriate UI components for the data (cards/tables/lists/badges)\n- Sections are titled and scannable\n- Covers the requested information completely with concrete details\n- Related items are visually grouped\n- Dense but legible \u2014 good use of space, neither cramped nor empty\n- Key items/values are emphasized (badges, color)\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-component/report.md b/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-component/report.md new file mode 100644 index 0000000..5c90574 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-component/report.md @@ -0,0 +1,5 @@ +# GEPA journey: generic-component + +rubric criteria: 7; val: 2 + +**Seed 0.5522 → Best 0.5522 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-flow/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-flow/best_prompts.json new file mode 100644 index 0000000..f8c2293 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-flow/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are planning a \"generic-flow\" diagram board. You will be given:\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nProduce a concise plain-text plan with SPECIFIC, real content (actual node labels, actual edge labels, actual structure). Generic plans fail.\n\nThe board is scored on: comprehension, visual_quality, geometry (on-canvas, no overlaps), and a rubric. PAST ATTEMPTS SCORED POORLY. Follow these hard-won rules exactly:\n\n== SIZE / GRANULARITY (critical \u2014 overcrowding caused off-canvas failures and 0.0 coverage) ==\n- Use 5\u20138 nodes TOTAL. Never exceed 8. Fewer, well-chosen nodes score far better than a long chain.\n- If the topic is a long linear sequence (e.g. a 12-step supply chain), COMPRESS adjacent steps into grouped nodes (e.g. \"Processing (pulp, dry, mill)\") rather than one node per step. Aim for the smallest set that still covers the topic.\n- Map node count to the named stages in the use case. If the use case explicitly lists N stages (e.g. 
\"evaporation, condensation, precipitation, collection\"), make those the backbone and add at most 1\u20132 context nodes.\n\n== LABELED EDGES (this criterion scored 0.0 in EVERY past attempt \u2014 fix it) ==\n- EVERY edge MUST have a short, explicit label, including obvious-looking sequential ones. Do not leave any edge unlabeled.\n- Present edges in a clean, uniform machine-readable list, one per line, in this exact format:\n EDGE: <source node> -> <target node>: \"<label>\"\n- Edge labels should be 1\u20134 words describing the relationship/action (e.g. \"heats\", \"produces\", \"fixed by RuBisCO\", \"roasted ~12 min\").\n- Do not use dotted/containment edges or decorative edges; only real labeled connections.\n\n== COLOR CODING (scored 0.5 \u2014 too many categories) ==\n- Use 2\u20134 color roles maximum (not 5). Each role must map to a clear node type.\n- Provide a legend block mapping each color to its role.\n- Every node lists its color.\n\n== LAYOUT / GEOMETRY (Example 3 had 11 nodes off-canvas) ==\n- Choose ONE consistent direction: top-down OR left-right, whichever fits the content; state it explicitly.\n- Keep the diagram compact so ALL nodes fit on a single canvas. With \u22648 nodes and one consistent flow this is automatic \u2014 do not spread nodes into corners or wide loops that push content off-canvas.\n- Lay out as evenly spaced rows (top-down) or columns (left-right). Place the legend in a corner zone clear of the flow.\n- Ensure no node overlaps and no edge passes through an unrelated node: keep the main flow on one spine; route any branch/return edge along the outside.\n\n== COVERAGE ==\n- Cover every stage/element named in the use case with no gaps, using the compressed node set.\n- Tie the flow to the reader's goal: the path from first to last node should let the {audience} achieve {purpose}.\n\n== OUTPUT FORMAT ==\nOutput these sections, concisely:\n1. Layout Direction (one line: top-down or left-right + why)\n2. Color Legend (2\u20134 roles)\n3. 
Nodes (numbered list; each: label + color + position row/column)\n4. Edges (the EDGE: lines, every edge labeled)\n5. Coverage Check (one or two lines confirming all named stages are present and the goal is met)\n\nKeep total length tight. Be concrete with real values everywhere.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-flow/report.md b/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-flow/report.md new file mode 100644 index 0000000..61af985 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-flow/report.md @@ -0,0 +1,5 @@ +# GEPA journey: generic-flow + +rubric criteria: 7; val: 2 + +**Seed 0.3998 → Best 0.4310 (+0.0313)** diff --git a/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-panes/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-panes/best_prompts.json new file mode 100644 index 0000000..c00921b --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-panes/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'generic-panes' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nWHAT A 'GENERIC-PANES' BOARD IS\nA board is a single fixed-size canvas divided into a small grid of rectangular panes. Each pane holds ONE visual element (chart, table, list, KPI tiles, etc.). The whole thing is rendered, so content that is too dense will visually OVERFLOW or CLIP its pane \u2014 this is the single most common failure. 
Plan for SPACE, not just content.\n\nCRITICAL LESSON FROM PAST ATTEMPTS (these scored 0.22\u20130.56):\nThe biggest losses came from (1) geometry overflow/clipping, (2) missing/unclear overall title rendering, (3) incomplete story, (4) unbalanced panes. Fix all four explicitly.\n\nHARD RULES TO FOLLOW\n\n1. KEEP THE GRID SMALL AND BALANCED\n - Prefer a 2\u00d72 grid (4 panes). Use 2\u00d73 (6 panes) ONLY if the story genuinely needs it AND each pane stays sparse.\n - Do NOT exceed 6 panes. Fewer, fuller-but-not-overflowing panes beat many cramped ones.\n\n2. STRICTLY LIMIT CONTENT PER PANE (this prevents overflow \u2014 the #1 killer)\n - Bar/line chart: MAX 5\u20137 data points/bars. No more.\n - Table: MAX 4 rows \u00d7 3 columns. Keep cell text to a few words.\n - List / ranked cards: MAX 4\u20135 short items, one line each (\u2264 ~6 words per line).\n - KPI / stat tiles: MAX 3\u20134 big numbers. Not 6.\n - Progress bars: MAX 4 bars.\n - Profile cards: MAX 4 cards, one short line each.\n - Each pane should have roughly EQUAL visual weight \u2014 no pane stuffed with 6 metrics while another has 2. Aim for 3\u20135 items everywhere.\n - Drop long subtitles, footnotes, and \"tip\" lines if they risk crowding. One short subtitle max per pane.\n\n3. EVERY PANE MUST HAVE: a short title + an explicit pane TYPE + concrete real values.\n - Pick the RIGHT type for the content: trends \u2192 line chart; category comparison \u2192 bar chart; current status/summary numbers \u2192 KPI tiles; structured records \u2192 small table; ranked items \u2192 list/cards; sequence over time \u2192 timeline; part-to-whole \u2192 progress bars or nested boxes.\n\n4. OVERALL TITLE \u2014 MAKE IT UNMISSABLE\n - State a single clear overall board title as a dedicated top banner, labeled explicitly as \"Overall Title\".\n - It must frame the whole board (name the subject + scope/timeframe). Keep it one line. This was scored 0.0 several times \u2014 do not bury it or split it.\n\n5. 
COHERENT LAYOUT + CLEAR READING ORDER\n - State the layout shape (e.g., \"2\u00d72 grid\") and the exact reading order (top-left \u2192 top-right \u2192 bottom-left \u2192 bottom-right).\n - Group panes so the arrangement is logical (e.g., \"top row = current state, bottom row = trajectory\"). Make the grouping explicit.\n\n6. COMPLETE STORY\n - The panes together must fully answer the reader's goal \u2014 no obvious gap. Briefly justify, at the end, how the set of panes covers the whole question the reader is asking.\n - Avoid redundancy (two panes showing the same thing) and avoid missing a key dimension.\n\nOUTPUT FORMAT (concise plain text)\n- Line: Overall Title: \"<title>\"\n- Line: Layout: <grid shape> + reading order + row/column grouping logic\n- For each pane: \"Pane N (position) \u2014 <pane title> \u2014 Type: <type>\" followed by the SPECIFIC content (real labels and values), respecting the per-type item caps above.\n- Final 1\u20132 lines: Story arc \u2014 how the panes together completely satisfy the goal.\n\nBe concrete with real values and labels (generic plans fail), but stay SPARSE so nothing overflows. When in doubt, cut content rather than add it.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-panes/report.md b/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-panes/report.md new file mode 100644 index 0000000..d46e3fc --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-panes/report.md @@ -0,0 +1,5 @@ +# GEPA journey: generic-panes + +rubric criteria: 7; val: 2 + +**Seed 0.5637 → Best 0.6275 (+0.0638)** diff --git a/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-vegalite/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-vegalite/best_prompts.json new file mode 100644 index 0000000..1119a82 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-vegalite/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'generic-vegalite' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- The chart type fits the data and the question being asked\n- Title, subtitle and axis titles are all present\n- Scales and units are readable and appropriate\n- A legend is present when multiple series/categories exist\n- The chart makes the key insight visually obvious\n- Tooltips or labels give additional context\n- Not overcrowded; fits the canvas cleanly\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-vegalite/report.md b/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-vegalite/report.md new file mode 100644 index 0000000..9b7cba0 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/generic_journey_generic-vegalite/report.md @@ -0,0 +1,5 @@ +# GEPA journey: generic-vegalite + +rubric criteria: 7; val: 2 + +**Seed 0.7487 → Best 0.7487 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/geomvq_balanced/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/geomvq_balanced/best_prompts.json new file mode 100644 index 0000000..0e6a684 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/geomvq_balanced/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are planning a flowchart that will be fed to a downstream auto-layout diagram generator (React Flow-style). You produce a PLAIN-TEXT PLAN, not a diagram. The plan is judged on TWO axes that BOTH must score well:\n(1) COMPREHENSION \u2014 a newcomer can answer specific factual questions (triggers, thresholds, ordering, actors, mappings, failure paths) from the rendered diagram, with a hard floor at 0.65 (below it the whole score is scaled down).\n(2) VISUAL RENDERING \u2014 the diagram fits on one screen: no off-canvas nodes, no overlapping node pairs, and clear edge labels.\n\nInputs:\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\n============================================================\n=== WHAT ACTUALLY CAUSES LOW SCORES (learned from real renders) ===\n============================================================\nPast plans had \"clean\" geometry descriptions in the text yet STILL rendered with 4\u20136 nodes off-canvas and 1\u20134 overlapping node pairs. The downstream engine's layout is driven by (a) how MANY nodes you create and (b) how LONG each node's text is. Therefore:\n\nA. 
NODE COUNT IS THE #1 RISK. Even 13\u201314 nodes pushed nodes off-canvas. TARGET 10\u201313 TOTAL nodes across the whole plan. Hard ceiling 13. Fewer, well-rendered nodes beat a complete-but-clipped plan.\n\nB. LONG NODE LABELS GET CLIPPED OR OVERLAP. Every failed `node_on_canvas`/`node_no_overlap`/`label_legibility` finding was a node stuffed with a full sentence (e.g., \"REJECTED: status=Rejected; recollect request sent to provider\"). RULE: every node label is a SHORT NOUN-PHRASE, \u2264 6 words, \u2264 ~40 characters. NO commas chaining clauses, NO \"\u2192\" inside a node, NO embedded thresholds/actors/policies inside a node. All of that detail goes in the LEGEND (it doesn't cost nodes and isn't clipped).\n\nC. EDGE LABELS MUST BE 1\u20132 WORDS. Repeated `edge_clarity` failures came from branch labels. Use only terse labels: \"yes\"/\"no\", \"pass\"/\"fail\", \"ok\"/\"timeout\". Never put a phrase on an edge.\n\nD. FEWER DECISIONS = CLEARER EDGES. Each decision adds branch edges that the engine routes badly. Use AT MOST 2 decision nodes per pane.\n\n============================================================\n=== HARD RENDERING CONSTRAINTS ===\n============================================================\n1. TOTAL NODE BUDGET: 10\u201313 nodes summed over ALL panes. Count EVERY box (entry, step, decision, terminal). Re-count before finalizing; if over 13, cut.\n2. PER-PANE BUDGET: 4\u20136 nodes. PREFER 2 PANES. Never plan more than 2 panes unless the grand total still fits \u2264 13 (then 3 tiny panes of ~4). Two well-rendered panes outscore three clipped ones.\n3. TERMINAL STATES: 2\u20133 leaves per pane, each a short noun-phrase leaf with no outgoing edges.\n4. NO EDGE-OVER-NODE GEOMETRY. For any decision, route the MAIN/continue branch to the IMMEDIATELY-next node in reading order; route the SIDE/failure branch to a TERMINAL LEAF placed off to the side. 
Failure/retry/escalation branches END in a labeled terminal \u2014 never loop back past other nodes.\n5. NO EDGE SKIPS A NODE. Every edge connects adjacent nodes in reading order or goes to a leaf.\n6. PANES ARE INDEPENDENT. Each pane has its own entry and its own terminals. Inter-pane connection is a TEXT LABEL only (\"\u2192 continues to Pane B\"), never a drawn edge spanning panes.\n7. NO LONG-RANGE OR BACK-JUMPING EDGES.\n8. Suggest top-to-bottom layout, or let the engine choose. Do not over-specify coordinates.\n\n============================================================\n=== HOW TO KEEP COMPREHENSION HIGH WITHOUT BLOATING NODES ===\n============================================================\nComprehension is graded on whether a reader can answer questions about triggers, numeric thresholds, ordering, responsible actors, mappings, and failure handling. Since node text must stay short, PUT ALL OF THIS DETAIL IN THE LEGEND (section 2), where it is fully readable and costs no nodes. The legend is your comprehension engine.\n\nIn the LEGEND you MUST include, with CONCRETE REAL-WORLD DEFAULT VALUES (never \"a threshold\"/\"a grace window\"):\n- TERMS a newcomer needs, defined briefly.\n- ALL FIXED MAPPING TABLES inline (e.g., severity tiers, commit-prefix\u2192version bump, category\u2192queue, impact\u00d7urgency\u2192priority, forward-step\u2192compensating-action, analyte\u2192critical-value range). 
Give every row with numbers.\n- TRIGGERS WITH VALUES: concrete thresholds/durations/counts (e.g., \"CPU > 80% sustained 5 min\", \"3 retries, backoff 1s/2s/4s\", \"antibiotics within 60 min\", \"K+ < 2.8 or > 6.0 mmol/L\").\n- EVALUATING ACTOR for each decision: name WHO/WHAT decides (leader/quorum/CI/on-call engineer/RN/LIS) and whether AUTOMATED or HUMAN.\n- STATE/DATA: who creates/persists/validates it, where stored, where checked.\n- FAILURE/RECOVERY POLICY: for each fallible step \u2014 block / retry (count + backoff) / escalate (to whom, after what timeout) / rollback. Name the NOTIFICATION METHOD and WHO IS NOTIFIED (a past failure: \"recollect request sent to provider\" lost points for not naming the method). State whether a redo is automatic or must be re-ordered.\n- ORDERING: the explicit sequence of major operations, marking automated vs manual.\n\nThe flowchart NODES carry only the flow skeleton (short labels). The LEGEND carries the answers.\n\n============================================================\n=== OUTPUT FORMAT (plain text, labeled sections) ===\n============================================================\n1. Purpose & Scope (1\u20132 sentences).\n2. Context for Newcomers \u2014 LEGEND: terms + ALL fixed mappings/tables/thresholds/actors/failure-policies with concrete values. This is where comprehension detail lives. Be exhaustive here.\n3. Panes \u2014 name each pane, what it covers, and its node count. State GRAND TOTAL and confirm \u2264 13.\n4. For each pane:\n - Entry/Trigger (concrete condition; full detail can reference the legend).\n - Key Steps/States: ordered list. 
For EACH node give: the SHORT node label (\u22646 words, the exact text that will render) AND a one-line note of the underlying detail/actor (the detail stays out of the rendered node, but you record it so it's reflected in the legend).\n - Decisions & Branches (\u22642 per pane): for each \u2014 the condition (short), the evaluating actor (named, automated/human), and each branch's destination with a 1\u20132 WORD edge label. Main branch \u2192 immediately-next node; failure/timeout/retry/escalation branch \u2192 a side terminal leaf.\n - Terminal States: 2\u20133 short leaf labels.\n - Geometry confirmation: confirm node count, that main branches go to adjacent nodes, side branches go to side leaves, no edge skips a node, no cross-pane edge.\n5. Out of Scope \u2014 explicitly state what is excluded, INCLUDING any panes/steps you dropped to stay within the 13-node budget.\n\nBEFORE FINALIZING, verify ALL of:\n- Grand total nodes \u2264 13 (re-count every box).\n- Every node label is a short noun-phrase \u2264 6 words with no commas/arrows/embedded numbers.\n- Every numeric threshold, actor, mapping, and failure policy appears in the LEGEND.\n- Every edge label is 1\u20132 words.\n- \u2264 2 decisions per pane; \u2264 3 terminals per pane.\n- Every failure branch ends in a leaf; no edge skips a node; no drawn cross-pane edge.\n- Comprehension questions implied by the Purpose (triggers, ordering, actors, mappings, failure handling, notification method) are all answerable from the legend.\nIf any check fails, cut nodes or move detail to the legend until it passes. 
A smaller plan that fully renders and keeps its detail in the legend beats a complete plan that clips.", + "generate": "You generate a flowchart `flow` diagram as JSON that renders cleanly in a React Flow canvas and is readable as a SINGLE screenshot without panning or zooming.\n\n{skill_context}\n\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single flow JSON object. Output ONLY the JSON (no prose, no code fences).\n\n=================================================================\nTOP PRIORITY: THE IMAGE MUST FIT ON SCREEN\nThis is the #1 score-killer. In nearly EVERY past run, even logically clean diagrams with only 11 nodes and 2 panes still rendered 4\u20135 nodes OFF the visible canvas. The auto-layout places the two panes side by side and stacks each pane's nodes vertically; tall columns and wide fan-outs get pushed off the canvas or clipped at the edges.\n\nWHAT THE DATA SHOWS:\n - An 11-node / 2-pane TB diagram with a \"clean\" geometry check STILL lost 3\u20135 nodes off-canvas. Clean geometry is NOT enough.\n - The single most reliable fix is FEWER NODES and SHORTER VERTICAL SPINES. The diagram that scored best (0.93 comprehension) still lost nodes because columns were too tall.\n - Therefore: be MORE aggressive about compactness than any prior attempt. Compactness beats completeness of any single label.\n\nHARD COMPACTNESS TARGETS (stricter than before):\n - TOTAL NODES: aim for 8\u201310. NEVER exceed 11. If the plan implies more, consolidate.\n - VERTICAL SPINE per pane: \u2264 4\u20135 nodes counting branch leaves. A pane's longest top-to-bottom chain must be short. If a pane needs 6+ nodes, cut or merge.\n - At most 2 panes. Two panes already nearly overflow horizontally; never use 3.\n - LABELS: \u2264 ~60 characters, ONE line strongly preferred. Long labels create tall boxes that push everything down off-canvas. 
Two-line max, never three.\n\n=================================================================\nOUTPUT SCHEMA\n{\n \"direction\": \"TB\" | \"LR\",\n \"nodes\": [\n { \"id\": \"<unique-id>\", \"data\": { \"label\": \"<string>\", \"status\": \"<status>\" }, \"group\": \"<groupId>\" }\n ],\n \"edges\": [\n { \"source\": \"<id>\", \"target\": \"<id>\", \"data\": { \"label\": \"<condition/trigger>\" } }\n ],\n \"groups\": [\n { \"id\": \"<groupId>\", \"label\": \"<pane title>\", \"color\": \"<hex>\" }\n ]\n}\n- `status` values: \"info\" (entry/trigger points), \"active\" (process/action steps), \"neutral\" (decision/evaluation nodes or neutral terminals), \"warn\" (failure/error/escalation/abort), \"success\" (successful terminals).\n- Edge `data.label`: use ONLY for conditions/branch outcomes (1\u20133 words). Unconditional sequential edges get NO label.\n- Every node belongs to a group. Every group needs a distinct color.\n\n=================================================================\nHARD SIZE LIMITS\n\nL1. TOTAL NODES \u2264 11, aim 8\u201310. Fold pure \"action then next\" steps into the following node's label. Drop only out-of-scope detail; never silently drop an in-scope decision branch or terminal.\n\nL2. AT MOST 2 PANES. Never 3. If the plan defines a third pane only for shared failure/abort terminals, instead keep each abort terminal as a small leaf inside the pane that triggers it.\n\nL3. LABELS \u2264 ~60 chars, one line preferred. Pack the single most decision-relevant fact (one number OR one actor, not both if it forces a second line). Drop articles, filler, restated context. Prefer \"OCSP revoked? (hard-fail)\" over a sentence. Do NOT cram multiple parenthetical clauses into one label. Terminal labels \u2264 ~50 chars \u2014 terminals are the most-clipped nodes.\n\nL4. WIDTH & HEIGHT CONTROL.\n - Keep each pane's longest vertical chain to \u2264 4\u20135 nodes (including any branch leaf). 
This is the most important rule for staying on-canvas.\n - Each decision branches to AT MOST one side-leaf plus the main next node. Place the side-leaf immediately adjacent to its decision.\n - NEVER create one shared distant \"sink\" node that many decisions point to \u2014 those long edges cross unrelated nodes. Give each failing decision its OWN adjacent terminal leaf.\n - If a pane would exceed the spine limit, consolidate sequential steps \u2014 do not extend the column.\n\n=================================================================\nLAYOUT RULES\n\n1. SPLIT INTO PANES per the plan (\u2264 2 groups), each a distinct hex color. ~4\u20135 nodes per pane.\n\n2. NO EDGE-OVER-NODE CROSSINGS. An edge to a NON-ADJACENT node visually cuts across nodes between them.\n - Order nodes so edges connect ADJACENT nodes along a linear main path.\n - CRITICAL ORDERING TRAP (caused a documented failure): when a decision D branches yes\u2192next and no\u2192leaf, the leaf must sit ADJACENT to D, and `next` must not be skipped over by the leaf edge. E.g., if you place D then `next` then `leaf`, the edge D\u2192leaf jumps over `next` (FAIL). Place the failure leaf IMMEDIATELY after its decision, before the main-path continuation, OR verify neither branch edge skips the other's target.\n - Loop-back/back edges only if source and target are truly adjacent. If a back-edge jumps over ANY node, do NOT draw it \u2014 restructure as forward progress or terminate at an adjacent leaf.\n\n3. DIRECTION: default \"TB\". Use \"LR\" only for a short pane with several small parallel side-branches.\n\n4. CROSS-PANE HANDOFF: do NOT draw edges between panes. Indicate the handoff in a node label (e.g., \"\u2192 Pane B\"). Pane B has its own entry node.\n\n=================================================================\nEDGE-LABEL CLARITY (repeatedly scored 0.00\u20130.78 \u2014 a major loss)\nEdge labels were flagged failing even when present. 
To maximize edge_clarity:\n - Put a short (1\u20133 word) outcome label on EVERY edge leaving a decision node: \"pass\"/\"fail\", \"hit\"/\"miss\", \"yes\"/\"no\", \"valid\"/\"invalid\", \"in-stock\"/\"0 on-hand\", \"approve\"/\"reject\".\n - Do NOT label plain sequential edges \u2014 extra labels clutter and lower clarity.\n - Keep decision-edge labels terse; avoid embedding \"\u2192 Pane B\" inside an edge label (put handoff text in the node instead).\n - Fewer total edges + fewer nodes makes each label more legible. This is another reason to keep node count low.\n\n=================================================================\nCONTENT / LABEL RULES (keep TERSE)\nA guardrail checks whether a first-time reader can answer key questions FROM THE IMAGE. The fix for low comprehension is NOT longer labels \u2014 it is FEWER, SHORTER nodes that ALL stay visible. A detailed node that falls off-canvas scores zero.\n\n5. Each kept label carries ONE key concrete fact from the plan, compactly:\n - A number when it drives a decision: \"timeout 200ms\", \"retry 2x\", \"TTL 300s\", \"depth 5\", \"SLA 2 biz days\", \"on-hand \u22651?\".\n - WHO performs the step as a short prefix: \"CI:\", \"CD:\", \"PDP:\", \"PEP:\", \"SME:\", \"Tech:\". Readers consistently ask which actor owns a step.\n - For gates, enumerate criteria abbreviated: \"0 errors + 4 req fields?\", \"sig+expiry valid?\".\n - Pick the SINGLE most decision-relevant fact; do not list all variants if it forces a second line.\n\n6. DECISION NODES: phrase as a question; label each outgoing edge with the outcome (see edge-label section).\n\n7. TERMINAL STATES: explicit nodes with correct status. Prefix \"TERMINAL:\" (good/neutral end) or \"ABORT:\" (failure). Keep \u2264 ~50 chars. Include in-scope failure/edge paths as explicit leaves: timeouts, retries-exhausted, dependency-down/fail-closed, rollback/abort, escalation-on-SLA. But keep them as small adjacent leaves.\n\n8. 
COVER THE PLAN'S SCOPE within the node budget: entry trigger, key steps, every in-scope decision/branch, all terminal states. When over budget, consolidate sequential steps \u2014 never drop an in-scope branch. Do not invent steps beyond the plan.\n\n=================================================================\nPROCESS (do before emitting)\n- Map plan panes \u2192 \u2264 2 groups, distinct hex colors.\n- List required nodes. If > 11, consolidate (fold pure action steps into neighbors) until 8\u201310.\n- Within each pane, order nodes so the longest vertical chain is \u2264 4\u20135 nodes and every edge connects adjacent nodes.\n- Give each failing decision its OWN adjacent terminal leaf \u2014 no distant shared sink.\n- Verify the ORDERING TRAP: for each decision, confirm neither branch edge skips the other branch's target.\n- Eliminate every back-edge that skips a node.\n- Shorten every label to \u2264 ~60 chars / one line, keeping actor prefix + one key fact.\n- Put a 1\u20133 word outcome label on every decision edge; none on plain sequential edges.\n- Final render check (assume auto-layout spreads nodes and places panes side by side): Is any pane's vertical chain > 5 nodes (will run off bottom)? Are there > 2 panes? Would any edge cross an unrelated node? Would any leaf/terminal land off-canvas? If in ANY doubt, REMOVE or MERGE nodes and SHORTEN labels before emitting.\n\nOutput ONLY the final JSON object." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/geomvq_balanced/report.md b/scripts/experiments/gepa-flowchart/overnight/geomvq_balanced/report.md new file mode 100644 index 0000000..6fd9419 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/geomvq_balanced/report.md @@ -0,0 +1,16 @@ +# GEPA flowchart optimization — geomvq_balanced (RECOVERED from gepa_state) + +- iterations: 12 total evals: 190 candidates: 7 + +| idx | val_agg | comprehension | geometry | visual_quality | parent | +|---|---|---|---|---|---| +| 0 (seed) | 0.5881 | 0.000 | 0.468 | 0.840 | [None] | +| 1 | 0.5620 | 0.000 | 0.457 | 0.836 | [0] | +| 2 | 0.5906 | 0.000 | 0.470 | 0.860 | [1] | +| 3 | 0.5730 | 0.000 | 0.468 | 0.857 | [2] | +| 4 **BEST** | 0.6195 | 0.000 | 0.526 | 0.827 | [2] | +| 5 | 0.5769 | 0.000 | 0.488 | 0.858 | [4] | +| 6 | 0.6134 | 0.000 | 0.519 | 0.850 | [5] | + +**Seed 0.5881 → Best (idx 4) 0.6195 (+0.0313)** + diff --git a/scripts/experiments/gepa-flowchart/overnight/geomvq_geomheavy/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/geomvq_geomheavy/best_prompts.json new file mode 100644 index 0000000..a4a671c --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/geomvq_geomheavy/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are planning a flowchart that will be fed to a downstream diagram generator (a React Flow-style auto-layout engine). 
Your plan's quality is judged on TWO things, BOTH of which must score well:\n(1) COMPREHENSION \u2014 a newcomer can answer specific factual questions from the rendered diagram, and\n(2) VISUAL RENDERING \u2014 the diagram fits on one screen with no clipped/off-canvas nodes, no overlapping node pairs, and no edges that cross unrelated nodes.\n\nCRITICAL LESSONS FROM PAST FAILURES (these are measured, real outcomes \u2014 obey them):\n- Plans with 12\u201314 nodes STILL pushed 4\u20135 nodes off-canvas and produced 1\u20133 overlapping node pairs, scoring only 0.41\u20130.58. The 18-node ceiling is NOT safe. The renderer clips aggressively. You must plan SMALLER and SHORTER than feels complete.\n- Long node labels get clipped or fail legibility. Labels like \"Calc next version from last tag (2.4.1+feat=2.5.0)\" and \"TERMINAL: scaled & healthy \u2014 auto-resolved, ticket logged\" failed. Keep every node label SHORT (target \u2264 5 words / \u2264 40 characters). Put the concrete values in the LEGEND, not in the node label.\n- Edge labels are NOT reliably readable. Branch conditions and ordering placed only on edges (\"a3\u2192a6: only docs/chore\", \"n3\u2192n7: at cap / cooldown\") were marked \"not clearly readable.\" Do not rely on edge labels to convey quiz-critical facts.\n- Even decision\u2192side-leaf routing produced edge-over-node overlaps in practice. Minimize the number of side branches per pane and keep terminals visually isolated.\n\nA smaller plan that fully renders beats a complete plan that clips. When in doubt, CUT.\n\nInputs:\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nProduce a concise plain-text plan describing what the flowchart must show. Follow ALL rules below.\n\n============================================================\n=== HARD RENDERING CONSTRAINTS ===\n============================================================\n\n1. TOTAL NODE BUDGET ACROSS THE WHOLE PLAN: 10\u201314 nodes. 
Treat 14 as a hard maximum, and aim for 10\u201312. Count EVERY box \u2014 every step, decision, terminal, and entry \u2014 summed across ALL panes. (Past plans at 13\u201314 nodes still clipped; do not push the ceiling.)\n\n2. PER-PANE BUDGET: each pane is 4\u20136 nodes. Prefer 5. A pane with 7 nodes will clip \u2014 never plan 7.\n\n3. PREFER TWO PANES. Two well-rendered panes outscore three clipped ones. Only plan 3 panes if the grand total stays \u2264 14 (i.e., ~4\u20135 each) AND each pane is genuinely independent. Never plan 4 panes. If the topic does not fit, push whole sub-topics to \"Out of Scope.\"\n\n4. KEEP NODE LABELS SHORT. Target \u2264 5 words / \u2264 40 characters per node label. No parentheticals with example values inside node labels. No multi-clause terminal descriptions. The concrete numbers, examples, and mappings go in the LEGEND (section 2), which does not consume the node budget and is fully answerable.\n\n5. DO NOT PUT QUIZ-CRITICAL FACTS ON EDGE LABELS. Edge labels may carry a 1\u20132 word branch tag (\"yes\"/\"no\"/\"quorum lost\"), but any ordering, threshold, actor, or mapping a reader will be quizzed on must be encoded in a NODE label or the LEGEND. Assume edge labels may be unreadable.\n\n6. NO EDGE-OVER-NODE GEOMETRY. The most common crash was a decision branching to its \"happy path\" successor while a side/failure node sat physically between them.\n - For any decision: route the MAIN/continue branch to the IMMEDIATELY-next node in reading order. Route the SIDE/failure branch to a TERMINAL leaf (no further outgoing edges) placed off to the side.\n - Failure/retry/escalation branches must END in a clearly-labeled terminal leaf. They must NOT route back into the main column past other nodes.\n - No edge may skip over an intermediate node to reach a later one. Every edge connects adjacent nodes in reading order, or goes to a side leaf.\n - LIMIT SIDE BRANCHES: at most 2 side-leaf terminals per pane. 
More side branches \u2192 more overlap risk.\n\n7. PANES ARE INDEPENDENT, SELF-CONTAINED DIAGRAMS. Each pane has its own Entry/Trigger and its own Terminal States. Inter-pane references are a LABEL only (\"\u2192 continues to Pane B\"), never a drawn edge. No edge may span panes or jump into the middle of another pane.\n\n8. NO LONG-RANGE OR BACK-JUMPING EDGES. Loop-backs/retries connect to an adjacent node or resolve into a terminal \"retry exhausted \u2192 escalate\" leaf \u2014 never an arrow spanning the chart.\n\n9. KEEP TERMINALS FEW: 2\u20133 per pane, each a leaf.\n\n10. Suggest top-to-bottom layout, or let the engine choose. Do not over-specify.\n\n============================================================\n=== CONTENT: BE CONCRETE, NOT GENERIC ===\n============================================================\nWithin the tight budget, the SHORT node labels carry flow logic; the LEGEND carries the concrete detail readers are quizzed on. Identify the factual questions the Purpose implies (triggers, ordering, actors, mappings, failure paths, AND recovery/rejoin paths) and make sure each is answerable from either a node or the legend.\n\nSpecify in the LEGEND (with concrete representative values):\n- TRIGGERS WITH VALUES: real thresholds/durations/counts (e.g., \"CPU > 80% sustained 5 min\", \"3 missed heartbeats / 15s timeout\", \"quorum = N/2+1\"). Never say \"a threshold\" or \"a grace window.\"\n- EVALUATING ACTOR per decision: name WHO/WHAT evaluates (the node, leader/coordinator, quorum service, CI, on-call, IC). State automated vs. human-in-the-loop EXPLICITLY \u2014 past plans lost points for not depicting a human approval gate where one exists.\n- STATE/DATA DETAILS: who creates/increments/persists/validates, where stored, where checked or causes rejection.\n- FAILURE PATHS: for every fallible action/external dependency, specify behavior \u2014 block, retry (count + backoff), escalate (to whom / after what timeout), or fail-safe/rollback. 
Each failure branch terminates in an explicit leaf node.\n- RECOVERY / REJOIN / REVERSE paths: explicitly cover the \"undo\" or \"come back\" side that readers ask about \u2014 e.g., scale-DOWN + anti-flapping cooldown (not just scale-up), rejoin/state-reconciliation for an isolated node, pre-release vs. stable routing, rollback after failed publish. If such a path exists in the topic, either depict it as a node/branch or state its conditions explicitly in the legend. Do not leave the reverse direction implied.\n- ORDERING: the legend and node sequence must make the order of major operations explicit (don't rely on edge labels).\n- MAPPING TABLES / LEGENDS: list FULL fixed mappings inline in the legend (commit prefix \u2192 bump, severity tiers, category \u2192 queue, impact\u00d7urgency \u2192 priority, etc.). These live in text, not as nodes.\n- NEWCOMER CONTEXT: short definitions of domain terms with concrete values.\n\n============================================================\n=== OUTPUT FORMAT (plain text, labeled sections) ===\n============================================================\n1. Purpose & Scope (1\u20132 sentences).\n2. Context for Newcomers \u2014 legend: terms + ALL fixed mappings/tables with concrete values + any reverse/recovery conditions not drawn as nodes.\n3. Panes \u2014 name each pane, what it covers, its node count. Then state the GRAND TOTAL and confirm it is \u2264 14 (ideally \u2264 12).\n4. For each pane:\n - Entry/Trigger (concrete condition),\n - Key Steps/States \u2014 ordered, with SHORT node labels (\u2264 5 words); for each, note the responsible actor and automated vs. 
manual,\n - Decisions & Branch Triggers \u2014 each: condition; evaluating actor; main branch \u2192 immediately-next node; each failure/timeout/side branch \u2192 a named leaf terminal,\n - Terminal States (2\u20133 leaves, short labels),\n - Confirm: no edge skips an intermediate node; \u2264 2 side leaves; no edge crosses into another pane; no quiz-critical fact lives only on an edge label.\n5. Out of Scope \u2014 explicitly list excluded panes/details, including anything dropped to stay within budget.\n\nBEFORE FINALIZING:\n- Re-count every node across all panes. If > 14, cut (drop a pane to Out of Scope or merge steps) until \u2264 14; aim for \u2264 12.\n- Check every node label is short (\u2264 ~5 words). Move long detail/examples into the legend.\n- Verify no quiz-critical fact is carried only by an edge label.\n- Verify every failure branch ends in a leaf, \u2264 2 side leaves per pane, and no edge runs past an intermediate node or into another pane.\n- Confirm the reverse/recovery direction relevant to the topic is covered (in a node or the legend).\nPrioritize fitting on one screen and short legible labels over volume. A 10-node plan that renders fully beats a 14-node plan that clips.", + "generate": "You generate a flowchart `flow` diagram as JSON that renders cleanly in a React Flow canvas and is readable as a SINGLE screenshot without panning or zooming.\n\n{skill_context}\n\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single flow JSON object. Output ONLY the JSON (no prose, no code fences).\n\n=================================================================\nTOP PRIORITY: THE IMAGE MUST FIT ON SCREEN\nPast diagrams were logically perfect but FAILED visually. 
The recurring, score-killing problems were:\n - Nodes pushed OFF the visible canvas (most common \u2014 happened in nearly every example).\n - Node-pair OVERLAPS (boxes rendered on top of each other).\n - Boxes CLIPPED at the right/bottom edges, cutting off label text.\n - Overly long labels forced into tall, narrow, multi-line boxes that overflow and crowd.\n - Three panes spread the graph too wide so the rightmost pane was cut off.\n\nThese are caused by (a) too many nodes, (b) labels that are too long, and (c) layouts that grow too wide/tall. The rules below exist to prevent exactly these failures. Treat compactness as more important than completeness of detail in any single label.\n\n=================================================================\nOUTPUT SCHEMA\n{\n \"direction\": \"TB\" | \"LR\",\n \"nodes\": [\n { \"id\": \"<unique-id>\", \"data\": { \"label\": \"<string>\", \"status\": \"<status>\" }, \"group\": \"<groupId>\" }\n ],\n \"edges\": [\n { \"source\": \"<id>\", \"target\": \"<id>\", \"data\": { \"label\": \"<condition/trigger>\" } }\n ],\n \"groups\": [\n { \"id\": \"<groupId>\", \"label\": \"<pane title>\", \"color\": \"<hex>\" }\n ]\n}\n- `status` values: \"info\" (entry/trigger points), \"active\" (process/action steps), \"neutral\" (decision/evaluation nodes or neutral terminals), \"warn\" (failure/error/escalation/abort), \"success\" (successful terminals).\n- Edge `data.label` is optional; use it ONLY for conditions/triggers/branch outcomes. Unconditional sequential edges need no label.\n- Every node must belong to a group. Every group needs a distinct color.\n\n=================================================================\nHARD SIZE LIMITS (do not exceed \u2014 these prevent off-canvas/overlap failures)\n\nL1. TOTAL NODES \u2264 15. Aim for 12\u201314. If the plan lists more, consolidate steps and drop only out-of-scope detail \u2014 never silently drop in-scope branches.\n\nL2. AT MOST 2 PANES. 
Two panes render within one screen; three panes consistently pushed content off-canvas in past renders. Only use a 3rd pane if the plan explicitly defines three AND the total stays \u2264 15 nodes with very short labels. If you would need a 3rd pane only to hold shared failure/abort terminals, instead keep each abort terminal as a leaf inside the pane that triggers it (see L4) rather than adding a pane.\n\nL3. LABEL LENGTH: keep each node label \u2264 ~90 characters and ideally on 1\u20132 lines. Long labels become tall narrow boxes that overflow. Pack the concrete facts (see CONTENT RULES) tersely; drop filler words, articles, and restated context. Prefer \"OCSP/CRL revoked? (hard-fail)\" over a full sentence. Use abbreviations the audience knows.\n\nL4. WIDTH CONTROL. A pane that fans out into many side-leaves grows too wide and clips. To stay narrow:\n - Keep the main flow a single vertical (TB) spine.\n - Each decision should branch to AT MOST one side-leaf (e.g., a failure terminal) plus the main next node. Place the side-leaf immediately beside its decision.\n - Do NOT create one shared distant \"sink\" node that many decisions point to from across the graph \u2014 those long edges cross unrelated nodes. Give each failing decision its OWN adjacent terminal leaf instead.\n\n=================================================================\nLAYOUT RULES (drive the geometry/visual score)\n\n1. SPLIT INTO PANES (groups) per the plan, but obey L2 (\u2264 2 panes). Each pane = one group. ~6\u20139 nodes per pane.\n\n2. NO EDGE-OVER-NODE CROSSINGS \u2014 a top failure. An edge from a node to a NON-ADJACENT node visually cuts across nodes between them.\n - Order nodes within a pane so edges connect ADJACENT nodes. Lay the main path out linearly in the flow direction.\n - Branch edges (decision \u2192 failure terminal) must go to an ADJACENT leaf. 
Never route a branch edge over 2+ intermediate nodes.\n - Loop-back edges (e.g., retry \u2192 earlier step) are allowed ONLY when source and target are adjacent/near-adjacent. If a back-edge would jump over 2+ nodes, restructure.\n - Concretely AVOID: `decisionA\u2192sink`, `decisionB\u2192sink`, `decisionC\u2192sink` where `sink` sits far away. Use per-decision adjacent terminals instead.\n\n3. DIRECTION. Default \"TB\" for mostly-linear flows. Use \"LR\" only if a pane is short and has many small parallel side-branches. Minimize crossings and overflow.\n\n4. CROSS-PANE HANDOFF: do NOT draw edges between panes. Reference the handoff in a label (e.g., \"\u2192 continues in Pane B\"). Pane B has its own entry node.\n\n=================================================================\nCONTENT / LABEL RULES (drive comprehension) \u2014 keep them TERSE per L3\n\n5. LABELS MUST BE SPECIFIC AND SELF-CONTAINED, but compact. Bake concrete facts from the plan directly into labels:\n - Concrete numbers: thresholds, timeouts, retries, backoffs, TTLs, %s, SLAs, quorums (e.g., \"timeout 150ms = 3 missed beats\", \"retry: nextIndex\u22121\", \"quorum N/2+1\", \"SLA 10 biz days, reminders d5/d8\", \"skew \u00b15min\").\n - WHO performs each step (actor/component): e.g., \"Leader evaluates\", \"Client TLS lib\", \"Compliance Officer\", \"Proc. Officer\". Readers consistently ask which actor owns a step.\n - For gates/checklists, ENUMERATE the actual criteria, abbreviated (e.g., \"trusted root + dates valid + not revoked + SAN match\", \"score\u226550, COI\u2265$1M, no sanctions match\"). Naming the gate alone is insufficient.\n - Comparison method / sample size when the plan gives it.\n - Failure/edge-case paths must appear as explicit NODES/EDGES, not be implied: timeouts, retries-exhausted, dependency unreachable, write failure, rollback/abort, escalation when SLA exceeded, stale-leader step-down, etc. (Trim these to short labels but keep them present.)\n\n6. 
DECISION NODES: phrase as a question; label EACH outgoing edge with the branch outcome (e.g., \"ACK\", \"REJECT\", \"out of range\", \"all docs \u226410d\", \"no common suite\"). Decision edge labels were noted as missing/faint in feedback \u2014 always include them.\n\n7. TERMINAL STATES: explicit nodes with proper status (\"success\" good ends, \"warn\" error ends, \"neutral\" held/archived). Prefix clearly: \"TERMINAL: ...\" or \"ABORT: ...\". Keep terminal labels short.\n\n8. COVER THE PLAN'S SCOPE: entry triggers, key steps, every in-scope decision/branch, all terminal states. Don't invent steps beyond scope; don't drop in-scope branches. If a topic detail (e.g., snapshot fallback, HelloRetryRequest) is NOT in the plan's scope, do not add it.\n\n=================================================================\nPROCESS (follow before emitting)\n- Map plan panes \u2192 groups (\u2264 2), assign distinct hex colors.\n- List all required nodes; if > 15, consolidate until \u2264 15.\n- Within each pane, order nodes along the main spine so edges stay short and adjacent.\n- Give each failing decision its OWN adjacent terminal leaf (no distant shared sink, no abort pane).\n- Shorten every label to \u2264 ~90 chars / 1\u20132 lines while preserving the concrete numbers and actor.\n- Mentally re-check: Would any edge cross an unrelated node? Would the layout be wider than ~2 panes or taller than one screen? Would any box clip? If yes, restructure (fewer nodes, shorter labels, tighter branching) BEFORE emitting.\n\nOutput ONLY the final JSON object." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/geomvq_geomheavy/report.md b/scripts/experiments/gepa-flowchart/overnight/geomvq_geomheavy/report.md new file mode 100644 index 0000000..e74a59f --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/geomvq_geomheavy/report.md @@ -0,0 +1,13 @@ +# GEPA flowchart optimization — geomvq_geomheavy (RECOVERED from gepa_state) + +- iterations: 6 total evals: 106 candidates: 4 + +| idx | val_agg | comprehension | geometry | visual_quality | parent | +|---|---|---|---|---|---| +| 0 (seed) | 0.5310 | 0.000 | 0.479 | 0.846 | [None] | +| 1 **BEST** | 0.5390 | 0.000 | 0.466 | 0.881 | [0] | +| 2 | 0.5172 | 0.000 | 0.442 | 0.891 | [0] | +| 3 | 0.5331 | 0.000 | 0.507 | 0.892 | [2] | + +**Seed 0.5310 → Best (idx 1) 0.5390 (+0.0079)** + diff --git a/scripts/experiments/gepa-flowchart/overnight/geomvq_strict/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/geomvq_strict/best_prompts.json new file mode 100644 index 0000000..bcb32b4 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/geomvq_strict/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are planning a flowchart that will be fed to a downstream diagram generator (a React Flow-style auto-layout engine). Your plan's quality is judged on TWO things, BOTH of which must score well:\n(1) COMPREHENSION \u2014 a newcomer can answer specific factual questions from the rendered diagram, and\n(2) VISUAL RENDERING \u2014 the diagram fits on one screen with no clipped/off-canvas nodes and no edges that cross unrelated nodes.\n\nCRITICAL LESSON FROM PAST FAILURES: Plans with perfect content still scored ~0.4 overall because the rendered board pushed 18\u201326 nodes off-canvas, clipped everything after the first pane, and produced edges that ran across unrelated nodes. The content was right but UNREADABLE. You must treat rendering geometry as a first-class constraint, not an afterthought. 
A smaller plan that fully renders beats a complete plan that clips.\n\nInputs:\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nProduce a concise plain-text plan describing what the flowchart must show. Follow ALL rules below.\n\n============================================================\n=== HARD RENDERING CONSTRAINTS (these caused past failures) ===\n============================================================\n\n1. TOTAL NODE BUDGET ACROSS THE WHOLE PLAN: 12\u201318 nodes MAXIMUM, summed over ALL panes. The generator lays out every node you describe; past plans assumed each pane rendered separately and blew the budget (3 panes \u00d7 8\u20139 nodes = 24+ nodes \u2192 26 off-canvas). Count EVERY box you mention \u2014 including every decision, every terminal state, and every intermediate step \u2014 and keep the grand total at or below 18.\n\n2. PER-PANE BUDGET: each pane is 4\u20137 nodes. If you genuinely cannot fit the topic in 18 total nodes, KEEP ONLY the 2\u20133 panes that best serve the stated Purpose and explicitly drop the rest into \"Out of Scope.\" Do NOT silently include a fourth pane that will be clipped. It is better to fully cover 2 panes than to truncate 3.\n\n3. PREFER FEWER PANES. Two well-rendered panes outscore three clipped ones. Only use 3 panes if the total still fits in 18 nodes (i.e., ~6 nodes each). Never plan 4 panes.\n\n4. NO EDGE-OVER-NODE GEOMETRY. The single most common error was a decision node branching to its \"happy path\" successor while a failure/side node sat physically between them (e.g., `D1 \u2192 main_step` rendered ON TOP OF \"side_node\"). To prevent this:\n - For any decision with a \"main/continue\" branch and a \"side/failure\" branch, route the MAIN branch to the IMMEDIATELY-next node, and send the SIDE branch to a TERMINAL node placed off to the side (a leaf with no further outgoing edges). 
Failure/retry/escalation branches should END in a clearly-labeled terminal state, NOT route back into the main column past other nodes.\n - Do NOT create edges that skip over an intermediate node to reach a later one. Every edge should connect adjacent nodes in reading order or go to a leaf terminal.\n - Do NOT create edges that jump from one pane into the middle of another pane. Panes connect only via a single clean hand-off from the last node of one pane to the first node of the next (or, better, treat panes as fully independent \u2014 see #5).\n\n5. TREAT PANES AS INDEPENDENT, SELF-CONTAINED DIAGRAMS. Each pane should have its own Entry/Trigger and its own Terminal States. Inter-pane references should be a label (\"\u2192 continues to Pane B\"), NOT a drawn edge spanning panes. This keeps each pane laying out cleanly.\n\n6. NO LONG-RANGE OR BACK-JUMPING EDGES. Loop-backs/retries should connect to an adjacent node or resolve into a terminal \"retry exhausted \u2192 escalate\" leaf \u2014 never an arrow spanning the whole chart.\n\n7. KEEP TERMINAL STATES FEW (2\u20134 per pane) and place each as a leaf.\n\n8. Suggest a simple layout direction (usually top-to-bottom) OR let the engine choose. Do not over-specify layout.\n\n============================================================\n=== CONTENT: BE CONCRETE, NOT GENERIC ===\n============================================================\nWithin the tight node budget, every node you DO include must carry the specific detail a reader will ask about. When choosing what to keep, prioritize the nodes/details that answer the Purpose's implied factual questions (the reader will be quizzed on triggers, ordering, actors, mappings, and failure paths). For each step/decision/trigger, NAME the conventional value rather than describing it abstractly. 
Always specify:\n\n- TRIGGERS WITH VALUES: concrete thresholds, durations, counts, conditions (e.g., \"CPU > 80% sustained 5 min\", \"3 missed heartbeats / 15s timeout\", \"majority = N/2+1 nodes\"). State a representative real-world default; never say \"a threshold\" or \"a grace window.\"\n- EVALUATING ACTOR: for each decision, name WHO/WHAT evaluates it (the node, a leader/coordinator, a quorum service, CI, on-call engineer, incident commander). State explicitly automated vs. human-in-the-loop approval.\n- STATE/DATA DETAILS: who creates/increments/persists/validates it, where it's stored, at which step it's checked or causes rejection.\n- FAILURE PATHS: for every fallible action or external dependency, specify behavior \u2014 block, retry (how many / what backoff), escalate (to whom / after what timeout), or fail-safe/rollback. Each failure branch must terminate in an explicit leaf node; never leave it implied. (But remember: each such terminal counts against the node budget \u2014 keep them tight.)\n- ORDERING: state the explicit sequence of major operations, AND mark which steps are automated vs. manual where the audience would ask.\n- MAPPING TABLES / LEGENDS: when the topic has a fixed mapping (commit prefix \u2192 version bump, severity tiers, category \u2192 queue, impact\u00d7urgency \u2192 priority), list the FULL mapping inline IN THE LEGEND. Mapping tables live in the text legend, not as separate nodes, so they don't consume the node budget but are still answerable.\n- NEWCOMER CONTEXT: a short legend defining domain terms the audience-newcomer needs, with concrete values.\n\nPush exhaustive mappings, rate tables, SLA tables, and definitions into the LEGEND text (section 2). Reserve actual flowchart NODES for the flow logic only. 
This is how you stay within the node budget while keeping comprehension high.\n\n============================================================\n=== OUTPUT FORMAT (plain text, labeled sections) ===\n============================================================\n1. Purpose & Scope (1\u20132 sentences).\n2. Context for Newcomers \u2014 legend: terms + ALL fixed mappings/tables with concrete values (this carries comprehension detail without using nodes).\n3. Panes \u2014 name each pane, what it covers, and its node count. Then state the GRAND TOTAL node count and confirm it is \u2264 18.\n4. For each pane: \n - Entry/Trigger (concrete condition),\n - Key Steps/States (ordered, concrete details + responsible actor, automated vs manual),\n - Decisions & Branch Triggers (each: specific threshold/condition; evaluating actor; every branch's destination; main branch \u2192 next node, failure/timeout/retry/escalation branch \u2192 a leaf terminal),\n - Terminal States (leaves, 2\u20134).\n - Explicitly confirm no edge skips over an intermediate node and no edge crosses into another pane.\n5. Out of Scope \u2014 state explicitly what is excluded, INCLUDING any panes/details you dropped to stay within the node budget.\n\nBefore finalizing: re-count every node across all panes. If the total exceeds 18, cut content (drop a pane to Out of Scope or merge steps) until it fits. Verify every failure branch ends in a leaf and no edge runs past an intermediate node. Prioritize concrete values, named actors, and explicit failure handling over volume \u2014 but never at the cost of fitting on one screen.", + "generate": "You generate a flowchart `flow` diagram as JSON that renders cleanly in a React Flow canvas and is readable as a SINGLE screenshot without panning or zooming.\n\n{skill_context}\n\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single flow JSON object. 
Output ONLY the JSON (no prose, no code fences).\n\n=================================================================\nTOP PRIORITY: THE IMAGE MUST FIT ON SCREEN\nPast diagrams were logically perfect but FAILED visually. The recurring, score-killing problems were:\n - Nodes pushed OFF the visible canvas (most common \u2014 happened in nearly every example).\n - Node-pair OVERLAPS (boxes rendered on top of each other).\n - Boxes CLIPPED at the right/bottom edges, cutting off label text.\n - Overly long labels forced into tall, narrow, multi-line boxes that overflow and crowd.\n - Three panes spread the graph too wide so the rightmost pane was cut off.\n\nThese are caused by (a) too many nodes, (b) labels that are too long, and (c) layouts that grow too wide/tall. The rules below exist to prevent exactly these failures. Treat compactness as more important than completeness of detail in any single label.\n\n=================================================================\nOUTPUT SCHEMA\n{\n \"direction\": \"TB\" | \"LR\",\n \"nodes\": [\n { \"id\": \"<unique-id>\", \"data\": { \"label\": \"<string>\", \"status\": \"<status>\" }, \"group\": \"<groupId>\" }\n ],\n \"edges\": [\n { \"source\": \"<id>\", \"target\": \"<id>\", \"data\": { \"label\": \"<condition/trigger>\" } }\n ],\n \"groups\": [\n { \"id\": \"<groupId>\", \"label\": \"<pane title>\", \"color\": \"<hex>\" }\n ]\n}\n- `status` values: \"info\" (entry/trigger points), \"active\" (process/action steps), \"neutral\" (decision/evaluation nodes or neutral terminals), \"warn\" (failure/error/escalation/abort), \"success\" (successful terminals).\n- Edge `data.label` is optional; use it ONLY for conditions/triggers/branch outcomes. Unconditional sequential edges need no label.\n- Every node must belong to a group. Every group needs a distinct color.\n\n=================================================================\nHARD SIZE LIMITS (do not exceed \u2014 these prevent off-canvas/overlap failures)\n\nL1. 
TOTAL NODES \u2264 15. Aim for 12\u201314. If the plan lists more, consolidate steps and drop only out-of-scope detail \u2014 never silently drop in-scope branches.\n\nL2. AT MOST 2 PANES. Two panes render within one screen; three panes consistently pushed content off-canvas in past renders. Only use a 3rd pane if the plan explicitly defines three AND the total stays \u2264 15 nodes with very short labels. If you would need a 3rd pane only to hold shared failure/abort terminals, instead keep each abort terminal as a leaf inside the pane that triggers it (see L4) rather than adding a pane.\n\nL3. LABEL LENGTH: keep each node label \u2264 ~90 characters and ideally on 1\u20132 lines. Long labels become tall narrow boxes that overflow. Pack the concrete facts (see CONTENT RULES) tersely; drop filler words, articles, and restated context. Prefer \"OCSP/CRL revoked? (hard-fail)\" over a full sentence. Use abbreviations the audience knows.\n\nL4. WIDTH CONTROL. A pane that fans out into many side-leaves grows too wide and clips. To stay narrow:\n - Keep the main flow a single vertical (TB) spine.\n - Each decision should branch to AT MOST one side-leaf (e.g., a failure terminal) plus the main next node. Place the side-leaf immediately beside its decision.\n - Do NOT create one shared distant \"sink\" node that many decisions point to from across the graph \u2014 those long edges cross unrelated nodes. Give each failing decision its OWN adjacent terminal leaf instead.\n\n=================================================================\nLAYOUT RULES (drive the geometry/visual score)\n\n1. SPLIT INTO PANES (groups) per the plan, but obey L2 (\u2264 2 panes). Each pane = one group. ~6\u20139 nodes per pane.\n\n2. NO EDGE-OVER-NODE CROSSINGS \u2014 a top failure. An edge from a node to a NON-ADJACENT node visually cuts across nodes between them.\n - Order nodes within a pane so edges connect ADJACENT nodes. 
Lay the main path out linearly in the flow direction.\n - Branch edges (decision \u2192 failure terminal) must go to an ADJACENT leaf. Never route a branch edge over 2+ intermediate nodes.\n - Loop-back edges (e.g., retry \u2192 earlier step) are allowed ONLY when source and target are adjacent/near-adjacent. If a back-edge would jump over 2+ nodes, restructure.\n - Concretely AVOID: `decisionA\u2192sink`, `decisionB\u2192sink`, `decisionC\u2192sink` where `sink` sits far away. Use per-decision adjacent terminals instead.\n\n3. DIRECTION. Default \"TB\" for mostly-linear flows. Use \"LR\" only if a pane is short and has many small parallel side-branches. Minimize crossings and overflow.\n\n4. CROSS-PANE HANDOFF: do NOT draw edges between panes. Reference the handoff in a label (e.g., \"\u2192 continues in Pane B\"). Pane B has its own entry node.\n\n=================================================================\nCONTENT / LABEL RULES (drive comprehension) \u2014 keep them TERSE per L3\n\n5. LABELS MUST BE SPECIFIC AND SELF-CONTAINED, but compact. Bake concrete facts from the plan directly into labels:\n - Concrete numbers: thresholds, timeouts, retries, backoffs, TTLs, %s, SLAs, quorums (e.g., \"timeout 150ms = 3 missed beats\", \"retry: nextIndex\u22121\", \"quorum N/2+1\", \"SLA 10 biz days, reminders d5/d8\", \"skew \u00b15min\").\n - WHO performs each step (actor/component): e.g., \"Leader evaluates\", \"Client TLS lib\", \"Compliance Officer\", \"Proc. Officer\". Readers consistently ask which actor owns a step.\n - For gates/checklists, ENUMERATE the actual criteria, abbreviated (e.g., \"trusted root + dates valid + not revoked + SAN match\", \"score\u226550, COI\u2265$1M, no sanctions match\"). 
Naming the gate alone is insufficient.\n - Comparison method / sample size when the plan gives it.\n - Failure/edge-case paths must appear as explicit NODES/EDGES, not be implied: timeouts, retries-exhausted, dependency unreachable, write failure, rollback/abort, escalation when SLA exceeded, stale-leader step-down, etc. (Trim these to short labels but keep them present.)\n\n6. DECISION NODES: phrase as a question; label EACH outgoing edge with the branch outcome (e.g., \"ACK\", \"REJECT\", \"out of range\", \"all docs \u226410d\", \"no common suite\"). Decision edge labels were noted as missing/faint in feedback \u2014 always include them.\n\n7. TERMINAL STATES: explicit nodes with proper status (\"success\" good ends, \"warn\" error ends, \"neutral\" held/archived). Prefix clearly: \"TERMINAL: ...\" or \"ABORT: ...\". Keep terminal labels short.\n\n8. COVER THE PLAN'S SCOPE: entry triggers, key steps, every in-scope decision/branch, all terminal states. Don't invent steps beyond scope; don't drop in-scope branches. If a topic detail (e.g., snapshot fallback, HelloRetryRequest) is NOT in the plan's scope, do not add it.\n\n=================================================================\nPROCESS (follow before emitting)\n- Map plan panes \u2192 groups (\u2264 2), assign distinct hex colors.\n- List all required nodes; if > 15, consolidate until \u2264 15.\n- Within each pane, order nodes along the main spine so edges stay short and adjacent.\n- Give each failing decision its OWN adjacent terminal leaf (no distant shared sink, no abort pane).\n- Shorten every label to \u2264 ~90 chars / 1\u20132 lines while preserving the concrete numbers and actor.\n- Mentally re-check: Would any edge cross an unrelated node? Would the layout be wider than ~2 panes or taller than one screen? Would any box clip? If yes, restructure (fewer nodes, shorter labels, tighter branching) BEFORE emitting.\n\nOutput ONLY the final JSON object." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/geomvq_strict/report.md b/scripts/experiments/gepa-flowchart/overnight/geomvq_strict/report.md new file mode 100644 index 0000000..2986ff2 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/geomvq_strict/report.md @@ -0,0 +1,13 @@ +# GEPA flowchart optimization — geomvq_strict (RECOVERED from gepa_state) + +- iterations: 7 total evals: 112 candidates: 4 + +| idx | val_agg | comprehension | geometry | visual_quality | parent | +|---|---|---|---|---|---| +| 0 (seed) **BEST** | 0.6102 | 0.000 | 0.484 | 0.870 | [None] | +| 1 | 0.5994 | 0.000 | 0.469 | 0.868 | [0] | +| 2 | 0.5192 | 0.000 | 0.469 | 0.888 | [0] | +| 3 | 0.4984 | 0.000 | 0.416 | 0.783 | [2] | + +**Seed 0.6102 → Best (idx 0) 0.6102 (+0.0000)** + diff --git a/scripts/experiments/gepa-flowchart/overnight/infra_journey_cloud-architecture/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/infra_journey_cloud-architecture/best_prompts.json new file mode 100644 index 0000000..3f4c603 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/infra_journey_cloud-architecture/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'cloud-architecture' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Resources are grouped into labeled zones (VPC, public/private subnets, tiers)\n- Each resource is a labeled node showing its service/type\n- Directed edges show the request/traffic flow between components\n- The internet/user entry point and the cloud boundary are shown\n- Managed services (DB, queue, cache) are distinguished from compute\n- A legend or tones explain the zone/resource types\n- Nodes and zones do not overlap; edges avoid unrelated nodes\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/infra_journey_cloud-architecture/report.md b/scripts/experiments/gepa-flowchart/overnight/infra_journey_cloud-architecture/report.md new file mode 100644 index 0000000..5b08534 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/infra_journey_cloud-architecture/report.md @@ -0,0 +1,5 @@ +# GEPA journey: cloud-architecture + +rubric criteria: 7; val: 2 + +**Seed 0.1306 → Best 0.1306 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/infra_journey_infra-cost-breakdown/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/infra_journey_infra-cost-breakdown/best_prompts.json new file mode 100644 index 0000000..af916b7 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/infra_journey_infra-cost-breakdown/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan an 'infra-cost-breakdown' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nProduce a concise plain-text plan that names SPECIFIC content \u2014 real dollar values, real service/category labels, concrete titles, and explicit chart structure. Generic plans fail; every criterion must be satisfied with concrete, named content.\n\nThis is judged against a STRICT rubric. A great board MUST satisfy EVERY criterion below. For each, follow the guidance:\n\n1. CHART TYPE \u2014 A horizontal bar chart (or stacked horizontal bar) of cost by service/resource category. One bar per category. Prefer horizontal bars so long category labels and value labels fit.\n\n2. 
SORTED BY COST (this criterion has repeatedly failed \u2014 treat it as the #1 priority):\n - The bars MUST be ordered largest cost at the TOP, descending to smallest at the bottom.\n - State the sort EXPLICITLY and unambiguously in TWO places:\n (a) In the chart-type/structure description: \"Bars ordered top\u2192bottom by descending total cost.\"\n (b) As a literal ordered list of the bars in rendered top-to-bottom sequence, each with its value, so the descending order is visually verifiable (e.g., line 1 = highest cost, last line = lowest).\n - Do NOT rely on a table alone to imply sorting \u2014 write out the ordered bar sequence as it will actually appear on the chart, and confirm each value is smaller than the one above it.\n - If there is a long tail, bucket small items into a single \"Other (N services)\" bar placed last.\n\n3. COST AXIS TITLE + CURRENCY UNITS:\n - Give the value axis an explicit title that includes the currency, e.g., \"Monthly Cost (USD $)\".\n - Specify concrete tick marks with units (e.g., $0, $5K, $10K, $15K, $20K, $25K).\n\n4. CATEGORY LABELS \u2014 Each category/service is clearly labeled (real names like EC2, RDS, S3, Data Transfer, Lambda, Compute Engine, Cloud SQL, BigQuery, etc.). Label each bar directly. Give the category axis a title.\n\n5. VALUE LABELS + TOTAL:\n - Put a value label at the end of each bar showing the dollar amount (and optionally % of total), e.g., \"$21,400 (44%)\".\n - Show a grand TOTAL prominently (KPI box or subtitle), e.g., \"Total: $48,250 USD\". The sum must equal the sum of the bars.\n\n6. HIGHLIGHT TOP COST DRIVER (this has scored partial credit \u2014 make it unambiguous):\n - The single largest bar (which, given correct sorting, is the TOP bar) must be visually distinct: a saturated accent color (e.g., orange) while all other bars use a muted neutral color. 
State both colors.\n - Add a callout annotation on that bar naming it, its dollar value, and its % of total, plus a short action, e.g., \"Top driver \u2014 EC2 $21,400 (44% of spend). Review Reserved/Savings Plans.\"\n - Make the highlight a clearly singular, dominant visual treatment \u2014 not just a small marker.\n\n7. TITLE NAMES ACCOUNT/ENVIRONMENT + PERIOD:\n - The chart title must include BOTH a specific account/environment identifier AND a specific time period, e.g., \"AWS Monthly Cost by Service \u2014 prod-account (123456789012) \u2014 March 2025\".\n\nDomain notes to use for realistic content:\n- AWS services: EC2/Compute, RDS/Databases, S3/Storage, Data Transfer/Egress, Lambda, EKS, CloudWatch/Logs.\n- GCP services: Compute Engine, Cloud SQL, BigQuery, Cloud Storage, Networking (Egress), Kubernetes Engine (GKE), Cloud Logging, Pub/Sub, Cloud Functions.\n- For environment comparisons (dev/staging/prod): use a stacked horizontal bar with one segment per environment, but STILL sort the bars by each category's TOTAL cost descending, and highlight the top bar.\n- Compute (EC2/Compute Engine) is typically the largest driver, followed by databases, then data transfer/storage.\n- Realistic costs after credits, before tax; cite a source (e.g., \"AWS Cost Explorer, unblended cost\" or \"GCP Billing Export \u2192 BigQuery\").\n\nKeep the plan tight and scannable to maximize comprehension: lead with the title, then the explicit top-to-bottom ordered bar list with values, then axes, value labels/total, and the highlight. Avoid redundant prose. Every number must be concrete and internally consistent.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/infra_journey_infra-cost-breakdown/report.md b/scripts/experiments/gepa-flowchart/overnight/infra_journey_infra-cost-breakdown/report.md new file mode 100644 index 0000000..5e63452 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/infra_journey_infra-cost-breakdown/report.md @@ -0,0 +1,5 @@ +# GEPA journey: infra-cost-breakdown + +rubric criteria: 7; val: 2 + +**Seed 0.4188 → Best 0.5272 (+0.1084)** diff --git a/scripts/experiments/gepa-flowchart/overnight/infra_journey_network-topology/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/infra_journey_network-topology/best_prompts.json new file mode 100644 index 0000000..5ba9a41 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/infra_journey_network-topology/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'network-topology' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- VPC(s) and their public/private subnets are shown as zones\n- Internet/NAT gateways and their placement are shown\n- Routing between subnets/gateways is indicated by edges or labels\n- Security groups / firewall rules are represented\n- Subnets/VPCs are labeled with their CIDR ranges\n- Edges show the allowed traffic direction\n- Zones and nodes do not overlap; edges avoid unrelated nodes\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/infra_journey_network-topology/report.md b/scripts/experiments/gepa-flowchart/overnight/infra_journey_network-topology/report.md new file mode 100644 index 0000000..7c54e07 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/infra_journey_network-topology/report.md @@ -0,0 +1,5 @@ +# GEPA journey: network-topology + +rubric criteria: 7; val: 2 + +**Seed 0.4274 → Best 0.4274 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-module-tree/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-module-tree/best_prompts.json new file mode 100644 index 0000000..710089b --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-module-tree/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'terraform-module-tree' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- The root module is the top of the hierarchy\n- Child modules are nested under their calling module\n- Each module lists the resources it creates\n- Module inputs/outputs (or key variables) are indicated\n- Reused/shared modules are identifiable\n- The composition hierarchy reads top-down clearly\n- Title names the root configuration\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-module-tree/report.md b/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-module-tree/report.md new file mode 100644 index 0000000..91c9c9e --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-module-tree/report.md @@ -0,0 +1,5 @@ +# GEPA journey: terraform-module-tree + +rubric criteria: 7; val: 2 + +**Seed 0.8397 → Best 0.8397 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-plan/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-plan/best_prompts.json new file mode 100644 index 0000000..1378d6c --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-plan/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'terraform-plan' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Resources are grouped by action (create / update / replace / destroy)\n- Each action is color-coded (green add, yellow change, red destroy)\n- Each resource shows its address/type (e.g. aws_instance.web)\n- A summary shows counts: N to add, M to change, K to destroy\n- Notable changed attributes are listed per resource\n- Destructive (replace/destroy) changes are visually warned\n- Title names the environment/workspace being planned\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-plan/report.md b/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-plan/report.md new file mode 100644 index 0000000..cc44479 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-plan/report.md @@ -0,0 +1,5 @@ +# GEPA journey: terraform-plan + +rubric criteria: 7; val: 2 + +**Seed 0.5744 → Best 0.5744 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-resource-graph/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-resource-graph/best_prompts.json new file mode 100644 index 0000000..c540f2a --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-resource-graph/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'terraform-resource-graph' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Each resource/data source is a labeled node showing its type\n- Directed edges show resource dependencies (what depends on what)\n- The provider/root config is identifiable as a source node\n- The graph implies a clear creation order (sources before dependents)\n- Resources are visually distinguished by type or tone\n- The graph is laid out top-to-bottom\n- No edge passes through an unrelated resource node\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-resource-graph/report.md b/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-resource-graph/report.md new file mode 100644 index 0000000..667bd86 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/infra_journey_terraform-resource-graph/report.md @@ -0,0 +1,5 @@ +# GEPA journey: terraform-resource-graph + +rubric criteria: 7; val: 2 + +**Seed 0.7299 → Best 0.7299 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_agent-trace/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_agent-trace/best_prompts.json new file mode 100644 index 0000000..801d241 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_agent-trace/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan an 'agent-trace' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are producing a CONCISE plain-text plan that names SPECIFIC content (real values, labels, structure) for an agent-trace board. Generic plans fail. The plan is judged on comprehension, visual quality, geometry (no overflow/clipping), and a strict rubric.\n\n== WHAT AN AGENT-TRACE BOARD IS ==\nA vertical timeline of an AI agent's execution steps. Each step has a \"kind\": one of think / tool-call / observation / result / error. Steps are connected top-to-bottom. The board lets a reader audit how the agent reasoned and reached its conclusion.\n\n== HARD RUBRIC \u2014 EVERY ITEM MUST BE EXPLICITLY SATISFIED ==\n1. VERTICAL TIMELINE of step items, top \u2192 bottom, connected.\n2. ITEMS COLORED BY KIND. Use this fixed mapping (state it explicitly):\n - think = purple\n - tool-call = blue\n - observation = gray\n - result = green\n - error = red\n3. EACH STEP CARRIES A SMALL KIND BADGE (a short text pill like `think`, `tool-call`, `observation`, `result`, `error`).\n4. 
WINDOW HEADER shows a step-range badge in the EXACT form 'steps X-Y of N' (e.g. 'steps 1-8 of 8'). Put the total step count and range consistently \u2014 N must equal the number of steps you list.\n5. EACH STEP HAS A DIMMED DETAIL LINE (one short line) describing what happened.\n6. A RED ALERT NEAR THE END names the root cause/result.\n7. THE LATEST/ACTIVE STEP IS VISUALLY EMPHASIZED.\n\n== CRITICAL FIXES (these caused past failures \u2014 do them exactly) ==\n\nKEEP IT SMALL TO AVOID OVERFLOW/CLIPPING:\n- Use ONLY 6 to 8 steps total. Do NOT exceed 8. Fewer, clearer steps score better than many cramped ones.\n- Keep every step TITLE under ~45 characters. Keep every DETAIL LINE under ~60 characters.\n- Do not add long legends, extra notes sections, or multiple banners that bloat the layout. One compact color legend line is enough.\n\nROOT-CAUSE RED ALERT (must be unmistakably detectable \u2014 this repeatedly scored 0.0):\n- Make it a SEPARATE, STANDALONE red alert element \u2014 NOT buried inside a step, NOT merged into the result step.\n- Place it as its own item NEAR THE END (immediately before or right at the final step).\n- Give it an explicit, literal label so it is obvious: start the text with `ROOT CAUSE:` (plain words, not just an emoji).\n- It must be red-colored/error-styled and state the concrete cause in one sentence.\n- Example: `RED ALERT (error-styled, red) \u2014 ROOT CAUSE: <one concrete sentence naming the cause>.`\n\nACTIVE/LATEST STEP MARKER (must be unmistakably detectable \u2014 this repeatedly scored 0.0):\n- Mark the FINAL step (the latest one) as active using a concrete, literal in-step marker, not just prose description.\n- Add the literal text `\u25cf active` (or `[ACTIVE]`) directly on that step's title/label line.\n- ALSO describe concrete visual emphasis on THAT step: brighter fill, thicker/accent border, glow ring, full opacity (while earlier steps are slightly dimmed).\n- State both the literal marker AND the visual emphasis 
explicitly so it cannot be missed.\n\n== STRUCTURE EACH STEP LIKE THIS ==\nFor each step give, on tight lines:\n- Step number + KIND (with its color) + kind badge text\n- Short title (concrete: real tool name/query/value)\n- Dimmed detail line (concrete real value: actual output, status code, count, file:line, etc.)\n\n== CONTENT QUALITY ==\n- Use realistic, specific values: real-looking tool calls (e.g. `query_logs(service='api-gateway', code=504)`), concrete observations (`pool.active = 20/20`), real file/line refs, HTTP codes, assertion diffs, etc.\n- The trace should clearly progress: plan \u2192 act \u2192 observe \u2192 narrow down \u2192 root cause \u2192 result, matching the use case.\n- The red alert's named cause and the final result step should be consistent.\n\n== OUTPUT FORMAT ==\nPlain text. Compact. Sections:\n1. Window header (title + 'steps X-Y of N' badge)\n2. Color legend (one line)\n3. The 6\u20138 step timeline (each step as above)\n4. The standalone RED ALERT \u2014 ROOT CAUSE element near the end\n5. Explicit note that the final step is the active step with `\u25cf active` marker + emphasis styling\n\nBe concrete and brief. No filler.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_agent-trace/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_agent-trace/report.md new file mode 100644 index 0000000..e96c392 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_agent-trace/report.md @@ -0,0 +1,5 @@ +# GEPA journey: agent-trace + +rubric criteria: 7; val: 2 + +**Seed 0.6145 → Best 0.7571 (+0.1426)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_algorithm-walkthrough/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_algorithm-walkthrough/best_prompts.json new file mode 100644 index 0000000..2dde8c8 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_algorithm-walkthrough/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan an 'algorithm-walkthrough' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are producing a CONCISE PLAIN-TEXT PLAN for a 3-pane board. Name SPECIFIC content \u2014 real code, real numbers, real labels, real structure. Generic plans fail. Every value must be concrete.\n\nThe plan is graded against a STRICT rubric. The criteria below have historically scored ZERO or HALF \u2014 they are your top priority. Read the FIX notes carefully.\n\n=== THE THREE PANES (must be VISUALLY DISTINCT \u2014 historically only 0.5) ===\nEach pane needs (a) its own BOLD title, AND (b) a CLEARLY DIFFERENT visual treatment. To score full marks, make the three treatments maximally different from each other along MULTIPLE axes \u2014 background color, border color, border style, and font family. State all of these explicitly per pane. Example contrast set:\n- Pane 1: dark charcoal bg (#1e1e2e) + thin CYAN border + monospace font.\n- Pane 2: light cream bg (#fdf6e3) + solid AMBER border + sans-serif, zebra-striped rows.\n- Pane 3: white bg + thick GREEN top border (4px) + rendered chart.\nDo not reuse the same color on two panes. 
Name the hex codes.\n\n--- PANE 1: CODE (markdown/code pane) ---\n- Bold label naming the algorithm + language.\n- Annotated Python source with inline numbered markers \u2460\u2461\u2462\u2026 on key lines.\n- Keep code SHORT: 8\u201312 lines max. No blank padding.\n- One short caption (\u22641 line) tying code structure to its Big-O.\n\n--- PANE 2: TRACE TABLE (component pane) ---\n- Bold label naming the concrete example: include the ACTUAL input array AND target/parameter.\n- ONE real markdown table only (do NOT include an \"alt\" or backup second table \u2014 that causes overflow). Pick a small example that fully traces in \u22646 rows.\n- First column is \"Step\", rows ordered ascending 1,2,3,\u2026 top-to-bottom. State this explicitly.\n- Columns: Step + the key state variables + a Comparison column + an Action column. Aim for \u22646 ROWS and \u22646 COLUMNS. Short cell values only.\n- The Action column values MUST cite the code's annotation markers (e.g., \"\u2463 return 5\", \"\u2465 low=mid+1\").\n- One short caption (\u22641 line).\n\n--- PANE 3: COMPLEXITY CURVE (vega-lite pane) ---\nThis pane has historically scored complexity_chart 0.0 AND chart_has_context 0.0 EVEN when a title, axis titles, and data were described in prose. FIX: Do NOT merely describe the chart in bullet points. Provide a COMPLETE, VALID, INLINE Vega-Lite JSON spec that a renderer could parse directly, containing ALL of:\n - \"mark\": {\"type\":\"line\",\"point\":true}\n - \"title\": a real string naming the algorithm AND its Big-O vs the contrast class, e.g. \"Binary Search O(log\u2082 n) vs Linear O(n)\"\n - \"data\":{\"values\":[ ... 
]} with the ACTUAL numeric rows inlined (every {n, ops, series} object listed explicitly, both series).\n - encoding.x with \"field\":\"n\", \"type\":\"quantitative\", and \"axis\":{\"title\":\"Input size (n)\"}\n - encoding.y with \"field\":\"ops\",\"type\":\"quantitative\", and \"axis\":{\"title\":\"Operations / comparisons (worst case)\"}\n - encoding.color with \"field\":\"series\",\"type\":\"nominal\" and a legend (this is the legend distinguishing the two series).\n - A second layer or text/rule mark annotation calling out the gap at the largest n.\n Verify the JSON has title + x axis title + y axis title + numeric data values, or it scores 0.\n\n- Compute the data points CORRECTLY for the Big-O class. Use these reference values:\n - O(log n) with n=[10,100,1000,10000,100000] \u2192 \u2308log\u2082 n\u2309 = [4,7,10,14,17]\n - O(n): ops = n exactly.\n - O(n log n): n\u00b7\u2308log\u2082 n\u2309.\n - O(n\u00b2): ops = n\u00b2.\n- Always include a SECOND CONTRASTING reference series (e.g., the brute-force / naive class) with its own correct concrete data, plus a legend, plus an annotation explaining the performance gap (e.g., \"At n=100,000: 17 vs 100,000 comparisons\").\n- Use a SMALL number of points (5) and keep the spec compact.\n\n=== READS AS EXPLAINER (historically 0.5) ===\nAdd an explicit \"How the panes connect\" section that TEACHES a flow, not describes. It must explicitly:\n1. CODE \u2192 TRACE: state how specific annotation markers map to specific Action-column entries (cite an actual step, e.g., \"Step 1's '\u2465 low=mid+1' is line \u2465 firing because arr[3]=12 < 23\").\n2. TRACE \u2192 CURVE: state the concrete step count from the trace (e.g., \"2 steps for n=8 \u2264 log\u20828\") and explain why that step count IS the plotted complexity class \u2014 each trace step = one unit the curve counts.\n3. 
WHY IT MATTERS: tie the code\u2192state\u2192scaling progression back to the reader and goal in one sentence.\n\n=== GEOMETRY (historically ALWAYS overflows/clips \u2014 be aggressive) ===\nEvery prior attempt clipped. Be ruthless about compactness:\n- Code \u226412 lines, trace \u22646 rows / \u22646 columns, ONE table only.\n- Captions \u22641 line each. No long prose blocks inside any pane.\n- The Vega-Lite spec: 5 data points per series, no extra config.\n- Keep the entire plan tight; trim anything redundant.\n\n=== OUTPUT FORMAT ===\nConcise plain text. Headed sections: \"Pane 1\", \"Pane 2\", \"Pane 3\", then \"How the panes connect\". For each pane state its visual treatment (bg hex + border + font) and bold title. Pane 3 must contain a complete inline Vega-Lite JSON spec.\n\nBefore finishing, VERIFY and note:\n- Pane 3 JSON has title + x-axis title + y-axis title + inline numeric data for BOTH series.\n- Trace is step-ordered ascending, \u22646 rows, ONE table only.\n- The three panes use three DIFFERENT bg colors, border colors, and fonts.\n- The connection section cites specific steps/markers and explains the trace\u2192complexity link.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_algorithm-walkthrough/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_algorithm-walkthrough/report.md new file mode 100644 index 0000000..c143eff --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_algorithm-walkthrough/report.md @@ -0,0 +1,5 @@ +# GEPA journey: algorithm-walkthrough + +rubric criteria: 7; val: 2 + +**Seed 0.5258 → Best 0.6278 (+0.1020)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_architecture-zones/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_architecture-zones/best_prompts.json new file mode 100644 index 0000000..20970e8 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_architecture-zones/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'architecture-zones' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Nodes are clustered into clearly labeled horizontal zone bands (e.g. Edge / Frontend / Services / Data) stacked top-to-bottom, not a flat scatter\n- The overall flow direction reads top-to-bottom, with edges connecting only adjacent tiers (no edge visibly skips over a middle band)\n- Each zone band has its own background tint/color distinguishing it from neighboring bands\n- Nodes are color-coded by role/status (not all plain grey) with each node showing a label plus a secondary sub line\n- Edge arrows are colored (not all grey) and visually distinguishable by purpose\n- A legend/key panel maps colors to their meaning\n- No node boxes overlap and no edge line passes through an unrelated node box\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_architecture-zones/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_architecture-zones/report.md new file mode 100644 index 0000000..4bc6ec6 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_architecture-zones/report.md @@ -0,0 +1,5 @@ +# GEPA journey: architecture-zones + +rubric criteria: 7; val: 2 + +**Seed 0.4002 → Best 0.4002 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_before-after/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_before-after/best_prompts.json new file mode 100644 index 0000000..a67dc50 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_before-after/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'before-after' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- A two-column layout shows a Before card and an After card side by side\n- Each card contains a table of the same metrics for comparison\n- The After card includes a delta (\u0394) column showing the change\n- A headline badge shifts color (e.g. 
grey \u2192 green) to signal improvement\n- The cards are clearly labeled 'Before' and 'After'\n- Metric rows align across both cards so values are directly comparable\n- The direction of improvement is visually clear (color or sign on deltas)\n\nBeyond the rubric, your plan is ALSO scored on COMPREHENSION \u2014 how easily a reader\nunderstands the board at a glance and grasps the meaning of the change. Maximize this by\nadding interpretive context, not just raw numbers. Concretely:\n\n1. STRUCTURE & LAYOUT\n - Two equal-width cards side by side. Left = \"BEFORE\", right = \"AFTER\".\n - Identical metric set in the same row order in both cards, identical row heights,\n so each metric sits on one shared horizontal track for direct eye-scan comparison.\n - Before card: 2 columns (Metric | Value). After card: 3 columns (Metric | Value | \u0394).\n\n2. HEADLINE BADGE (color shift) \u2014 make it span/connect both cards\n - Before-state badge: grey background (#9E9E9E or #9CA3AF), text naming the baseline\n state plus a concrete identifier (e.g. \"Baseline build v2.3\", a date, or a baseline value).\n - After-state badge: green background (#2E7D32 or #16A34A), text naming the improved state\n PLUS the single headline win figure (e.g. \"Refactored build v3.0 \u2014 41% faster p95\").\n - Describe a visible transition: grey badge \u2192 arrow \u2192 green badge, signalling improvement.\n\n3. CONTENT \u2014 use SPECIFIC, REALISTIC values (generic plans fail)\n - Invent concrete, plausible numbers for every metric in the use case.\n - Label each card with the state identifier AND its measurement context (build/version,\n date, or measurement window), e.g. \"BEFORE (build v2.3, measured 2024-05-01)\".\n\n4. DELTA COLUMN (After card)\n - Every \u0394 shows: directional arrow + explicit sign + absolute change + percentage,\n e.g. 
\"\u25bc \u2212195 ms (\u221241%)\" or \"\u25b2 +720 req/s (+60%)\".\n - Use \u25bc where lower is better (latency, cost, errors, size, memory, time) and\n \u25b2 where higher is better (throughput, score, conversions).\n - Color every improving \u0394 green; state explicitly that any worsening \u0394 would render\n red with +/\u2212 sign and the opposite arrow.\n\n5. DIRECTION-OF-IMPROVEMENT \u2014 make it UNAMBIGUOUS and self-explanatory\n - Include an explicit rule note (small footer) stating which direction is good per metric,\n e.g. \"Green = improvement. Lower is better for latency/errors/memory; higher is better\n for throughput.\"\n - This interpretive legend is the biggest comprehension booster \u2014 always include it.\n\n6. FOOTER / CAPTION (boosts comprehension)\n - Add a caption giving measurement conditions (e.g. \"load test, 10k concurrent users,\n 30 min steady-state\" or \"measured over 30-day periods, Mar vs May 2024\") AND a one-line\n interpretation of what the change means / why it is the win.\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure)\nthat satisfies each criterion. Be concrete; generic plans fail. End with a short rubric-coverage\nchecklist confirming each criterion is met.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_before-after/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_before-after/report.md new file mode 100644 index 0000000..403bdd6 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_before-after/report.md @@ -0,0 +1,5 @@ +# GEPA journey: before-after + +rubric criteria: 7; val: 2 + +**Seed 0.7198 → Best 0.8756 (+0.1557)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_build-pipeline/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_build-pipeline/best_prompts.json new file mode 100644 index 0000000..b23f359 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_build-pipeline/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'build-pipeline' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nOutput a concise plain-text plan naming SPECIFIC, concrete content (real labels, real\ndurations, real step details, real coordinates, real hex colors). Generic plans fail.\nThe plan is judged on comprehension, visual quality, geometry, and a strict rubric.\nFollow the EXACT output format below \u2014 the grader parses it literally, so do not rename\nfields, drop colors, or use synonyms.\n\n== CANONICAL STATUS VOCABULARY (use these words VERBATIM, uppercase) ==\nPASSED \u2192 green #2E7D32\nRUNNING \u2192 amber #F9A825\nFAILED \u2192 red #C62828\nBLOCKED \u2192 grey #9E9E9E (use the word BLOCKED, never \"Skipped\" or \"Pending\" as the status word)\n\n== STATUS ASSIGNMENT RULES ==\n- Identify the stage's actual state from the use case.\n- If a stage FAILED: that stage = FAILED #C62828. Every stage AFTER it = BLOCKED #9E9E9E.\n- If a stage is RUNNING: that stage = RUNNING #F9A825. 
Every stage AFTER it = BLOCKED #9E9E9E.\n- Every stage BEFORE a failed/running stage = PASSED #2E7D32.\n- If ALL stages pass: every stage = PASSED #2E7D32 (no grey, no red, no amber).\n- There is at most ONE failed OR one running stage; everything before is green, everything after is grey.\n\n== HARD RUBRIC REQUIREMENTS (satisfy EVERY item) ==\n1. STAGES AS NODES: Every pipeline stage named in the use case is its own node, in execution\n order. Never merge or drop stages.\n2. STATUS COLORING: Each node line MUST contain, in this order, the canonical STATUS word\n AND the literal token \"fill\" AND the exact hex. Format: \"<Stage> @ (x,y) \u2014 <STATUS> fill <hex>\".\n Use ONLY the four exact hex codes above.\n3. EXECUTION-ORDER EDGES: Directed left-to-right edges with \u25ba joining ONLY adjacent stages.\n4. META/SUB LINE: Every node has a meta line with a concrete DURATION and a concrete STEP\n DETAIL (tool, commit hash, counts, artifact). Make them realistic and specific.\n5. FAILURE VISIBILITY (only if a stage actually failed):\n - The failed node is FAILED #C62828 and its meta MUST begin with \"FAILED:\" then a concrete reason.\n - EVERY downstream node is BLOCKED #9E9E9E with meta exactly: \"Skipped \u2014 blocked by <failed stage>\".\n - The edge entering each blocked stage is dashed grey (#9E9E9E); note this explicitly.\n6. RUNNING VISIBILITY (only if a stage is running, no failure):\n - The running node is RUNNING #F9A825; meta begins with \"running <elapsed> \u00b7\" then detail.\n - EVERY downstream node is BLOCKED #9E9E9E with meta exactly: \"Skipped \u2014 blocked by <running stage>\".\n - The edge entering each blocked stage is dashed grey (#9E9E9E).\n7. 
LEGEND: ALWAYS print the legend with ALL FOUR core entries in this exact line, regardless\n of which statuses appear on the board:\n \ud83d\udfe9 #2E7D32 Passed \u00b7 \ud83d\udfe7 #F9A825 Running \u00b7 \ud83d\udfe5 #C62828 Failed \u00b7 \u2b1c #9E9E9E Blocked\n Give the legend its own (x,y) clear of all nodes and edges.\n8. NO OVERLAP / CLEAN EDGES: No node overlaps another; no edge passes through an unrelated node.\n\n== GEOMETRY (strict \u2014 prior plans lost the most here) ==\nCanvas is fixed 1000 wide \u00d7 600 tall, origin (0,0) top-left. EVERYTHING must fit with margin;\nno coordinate may be negative or exceed the canvas, and no node's right edge may exceed 960.\n- All stage nodes in ONE horizontal row at y=260, node height 80.\n- Choose node width W and center-to-center gap by node count N:\n N \u2264 5: W = 150. x positions = 40, 230, 420, 610, 800 (right edge of last = 950 \u2264 960). OK.\n N = 6: W = 120. gap = floor((1000 - 80 - 120)/(6-1)) = 160.\n x = 40, 200, 360, 520, 680, 840 (right edge 960). OK.\n N = 7: W = 110. gap = floor((1000 - 80 - 110)/(7-1)) = 135.\n x = 40, 175, 310, 445, 580, 715, 850 (right edge 850+110 = 960). OK.\n N = 8: W = 95. gap = floor((1000 - 80 - 95)/(8-1)) = 117.\n x = 40, 157, 274, 391, 508, 625, 742, 859 (right edge 954). OK.\n General rule: leftmost x = 40; gap = floor((1000 - 80 - W) / (N-1)); shrink W until the\n rightmost node's right edge (last_x + W) \u2264 960. NEVER let a node go off-canvas. Verify the\n last node arithmetically before writing it.\n- State the explicit (x,y) of every node.\n- Edges run straight along the row centerline (y=300) between adjacent node edges only.\n- Place the legend box at a free coordinate clear of nodes (which occupy y=260\u2013340):\n use bottom-left (x=40, y=520) or top-right (x=780, y=20). 
Confirm it overlaps nothing.\n\n== OUTPUT FORMAT (keep tight; comprehension is scored) ==\nBoard Title: \"<specific run name / id / commit>\"\nCanvas: 1000\u00d7600\n\nNodes (execution order, with coordinates):\n1. <Stage> @ (x,y) \u2014 <STATUS> fill <hex>\n meta: \"<duration> \u00b7 <concrete step detail>\"\n... (one per stage; downstream-of-failure/running stages are BLOCKED with the exact meta above)\n\nEdges (left-to-right, \u25ba):\n<A> \u25ba <B> \u25ba <C> ... (explicitly note which edges are dashed grey into BLOCKED stages)\n\nLegend @ (x,y):\n\ud83d\udfe9 #2E7D32 Passed \u00b7 \ud83d\udfe7 #F9A825 Running \u00b7 \ud83d\udfe5 #C62828 Failed \u00b7 \u2b1c #9E9E9E Blocked\n\nReader Takeaway: one sentence on what the reader sees at a glance toward the goal \u2014 name the\nspecific colored stage(s) that matter (e.g. which stage is red/amber and what it means).\n\nReflect the ACTUAL status from the use case. Do NOT invent a failure if all stages pass, and\ndo NOT add a separate \"failure example\" sub-board. Only include BLOCKED (grey) nodes when a\nreal stage is downstream of a real failed or running stage.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_build-pipeline/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_build-pipeline/report.md new file mode 100644 index 0000000..256f446 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_build-pipeline/report.md @@ -0,0 +1,5 @@ +# GEPA journey: build-pipeline + +rubric criteria: 7; val: 2 + +**Seed 0.5642 → Best 0.6944 (+0.1302)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_c4-architecture/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_c4-architecture/best_prompts.json new file mode 100644 index 0000000..347624d --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_c4-architecture/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You produce a plan for a \"c4-architecture\" board. Your output is parsed by a renderer that draws\nthree vertically stacked panes corresponding to the three C4 zoom levels (top = least zoomed).\nYou are scored on: (a) machine-parseability of rows, nodes, and labeled edges; (b) comprehension\n(the three levels read as a coherent zoom-in); and (c) geometry (content must fit without\noverflowing or clipping).\n\nIMPORTANT CONTEXT FROM PRIOR FAILURES\nPrevious attempts used a terse \"N|name|type\" / \"E|src|dst|interaction|protocol\" scheme. 
Despite\nbeing well-formed, that scheme scored 0.0 on EVERY structural rubric (stacked_rows,\ncontainer_level, component_level, edges_labeled), 0.0 comprehension, and the canvas overflowed.\nConclusion: the parser did NOT recognize that terse scheme, AND there was too much content.\nTherefore you must (1) emit an explicit, self-describing, key-labeled structure the parser can\nsegment unambiguously, and (2) be extremely sparse so nothing clips.\n\nINPUTS\n- Use case: {topic}\n- Reader: {audience}\n- Goal: {purpose}\n\nWHAT TO MODEL (C4 levels, top to bottom = increasing zoom)\n- LEVEL 1 SYSTEM CONTEXT: the system as a single this-system box, plus the person(s) and external\n system(s) it talks to. No internals.\n- LEVEL 2 CONTAINER: zoom inside the system box. Show named apps/services as containers and data\n stores as datastores. Carry over a person and/or external ONLY if an edge needs it.\n- LEVEL 3 COMPONENT: zoom inside ONE container named in LEVEL 2. Show its internal building blocks\n (controllers/services/adapters/repositories) as components. Carry over a datastore or external\n ONLY if an edge references it.\n\nOUTPUT FORMAT \u2014 emit EXACTLY the structure below and NOTHING else. No markdown, no prose, no\ntitles, no notes, no checklists, no emoji, no extra blank lines. Use these literal section headers\nand these literal field keys, one item per line, fields separated by \" | \" (space pipe space).\n\nEach row is introduced by a header line. 
Under each header, list NODE lines first, then EDGE lines.\n- NODE line: NODE | id=<name> | type=<type> | row=<rownum>\n- EDGE line: EDGE | from=<name> | to=<name> | label=<interaction> | protocol=<protocol> | row=<rownum>\n\nEmit in EXACTLY this order:\n\nLEGEND | Person=blue | This System=teal | External=gray\nROW 1 | SYSTEM CONTEXT\nNODE | id=<name> | type=<type> | row=1\nNODE | id=<name> | type=<type> | row=1\nEDGE | from=<name> | to=<name> | label=<interaction> | protocol=<protocol> | row=1\nROW 2 | CONTAINER\nNODE | id=<name> | type=<type> | row=2\nNODE | id=<name> | type=<type> | row=2\nEDGE | from=<name> | to=<name> | label=<interaction> | protocol=<protocol> | row=2\nROW 3 | COMPONENT\nNODE | id=<name> | type=<type> | row=3\nNODE | id=<name> | type=<type> | row=3\nEDGE | from=<name> | to=<name> | label=<interaction> | protocol=<protocol> | row=3\n\nLINE RULES\n- Every node line begins literally with \"NODE | \" and contains exactly the keys id=, type=, row=.\n- Every edge line begins literally with \"EDGE | \" and contains exactly the keys from=, to=,\n label=, protocol=, row=.\n- The three row headers must appear literally and in order: \"ROW 1 | SYSTEM CONTEXT\",\n \"ROW 2 | CONTAINER\", \"ROW 3 | COMPONENT\".\n- The LEGEND line must be emitted first, exactly as written.\n- Never use the pipe character or the \"=\" character inside a field value.\n- The row= value on every node/edge must match the section it appears under (1, 2, or 3).\n- from= and to= on an edge must exactly match an id= of a node listed in that SAME row.\n\nTYPE VALUES (use exactly one, lowercase)\nperson | this-system | container | datastore | component | external\n\nPROTOCOL VALUES (concrete, never blank)\nHTTPS | JSON/HTTPS | REST/OAuth2 | JDBC/SQL | XML/HTTPS | SMTP | gRPC | AMQP | method call | in-process\n- Use \"method call\" or \"in-process\" for ROW 3 component-to-component edges.\n- Use a concrete network/db protocol for cross-container and external edges.\n\nSIZE LIMITS 
(critical \u2014 overflow zeroes the geometry score; keep it SMALL)\n- ROW 1: 2 to 3 nodes, 1 to 2 edges.\n- ROW 2: 3 to 4 nodes, 2 to 4 edges.\n- ROW 3: 3 to 4 nodes, 2 to 4 edges.\n- NEVER exceed 4 nodes in any row. Count carried-over anchor nodes toward the cap.\n- Node ids: 1 to 3 words. Label phrases: 2 to 4 words max.\n- Keep edge count at or below node count in each row.\n\nCONTENT RULES\n- Use specific, real names/technologies from the use case (e.g. \"Stripe\", \"PostgreSQL\",\n \"React SPA\", \"Spring API\", \"Salesforce\", \"Mainframe\"), never generic placeholders.\n- Keep names consistent across rows so the zoom continuity is clear: the container expanded in\n ROW 3 must be one of the containers named in ROW 2; any datastore/external carried into ROW 3\n must reuse the EXACT id used in ROW 2.\n- Pick exactly ONE container to expand in ROW 3 (prefer the one the use case calls out, e.g.\n \"components inside the API\" -> expand the API container; \"components inside ingestion\" ->\n expand the Ingestion container).\n- Output ONLY the LEGEND line, the three row headers, and their NODE/EDGE lines. 
Nothing else.\n\nEXAMPLE (illustrative shape only; adapt content to the actual use case)\nLEGEND | Person=blue | This System=teal | External=gray\nROW 1 | SYSTEM CONTEXT\nNODE | id=Customer | type=person | row=1\nNODE | id=Internet Banking | type=this-system | row=1\nNODE | id=Mainframe | type=external | row=1\nEDGE | from=Customer | to=Internet Banking | label=views accounts | protocol=HTTPS | row=1\nEDGE | from=Internet Banking | to=Mainframe | label=gets accounts | protocol=XML/HTTPS | row=1\nROW 2 | CONTAINER\nNODE | id=Web App | type=container | row=2\nNODE | id=API App | type=container | row=2\nNODE | id=Database | type=datastore | row=2\nNODE | id=Mainframe | type=external | row=2\nEDGE | from=Web App | to=API App | label=calls API | protocol=JSON/HTTPS | row=2\nEDGE | from=API App | to=Database | label=reads writes | protocol=JDBC/SQL | row=2\nEDGE | from=API App | to=Mainframe | label=fetches data | protocol=XML/HTTPS | row=2\nROW 3 | COMPONENT\nNODE | id=Accounts Controller | type=component | row=3\nNODE | id=Mainframe Adapter | type=component | row=3\nNODE | id=Database | type=datastore | row=3\nEDGE | from=Accounts Controller | to=Mainframe Adapter | label=requests data | protocol=method call | row=3\nEDGE | from=Mainframe Adapter | to=Database | label=caches data | protocol=JDBC/SQL | row=3", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_c4-architecture/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_c4-architecture/report.md new file mode 100644 index 0000000..14625eb --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_c4-architecture/report.md @@ -0,0 +1,5 @@ +# GEPA journey: c4-architecture + +rubric criteria: 7; val: 2 + +**Seed 0.1444 → Best 0.1744 (+0.0300)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_calendar-heatmap/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_calendar-heatmap/best_prompts.json new file mode 100644 index 0000000..47080fe --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_calendar-heatmap/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'calendar-heatmap' board and output it as JSON.\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nCRITICAL OUTPUT REQUIREMENT:\nYou MUST output a single valid JSON object and NOTHING else \u2014 no markdown, no\nprose, no code fences, no headings. Any non-JSON output scores 0.00. The JSON\nmust fully encode the board so it could be rendered directly.\n\nUse exactly this schema (fill every field with SPECIFIC, real values \u2014 generic\nplans fail):\n\n{\n \"type\": \"calendar-heatmap\",\n \"title\": \"<short title naming the activity, e.g. 'Daily Commit Activity'>\",\n \"subtitle\": \"<names the activity AND the exact period + total, e.g. 
'Apr 7 \u2013 Oct 5, 2025 \u00b7 26 weeks \u00b7 412 total commits'>\",\n \"axes\": {\n \"x\": { \"field\": \"week_index\", \"label\": \"Week\", \"domain\": \"<oldest\u2192newest week range, one column per calendar week>\" },\n \"y\": { \"field\": \"day_of_week\", \"label\": \"Day\", \"order\": [\"Mon\",\"Tue\",\"Wed\",\"Thu\",\"Fri\",\"Sat\",\"Sun\"] }\n },\n \"grid\": {\n \"columns\": <integer number of week columns>,\n \"rows\": 7,\n \"total_cells\": <columns * 7>,\n \"week_starts_on\": \"Mon\"\n },\n \"cell\": {\n \"shape\": \"rounded-rect\",\n \"width_px\": 13,\n \"height_px\": 13,\n \"gap_px\": 3,\n \"out_of_range\": \"transparent\",\n \"note\": \"every cell is a discrete rectangle separated by the gap on BOTH axes\"\n },\n \"color_scale\": {\n \"type\": \"sequential-single-hue\",\n \"encodes\": \"<the count metric, e.g. commits per day>\",\n \"zero_is_palest\": true,\n \"buckets\": [\n { \"range\": \"0\", \"min\": 0, \"max\": 0, \"color\": \"#ebedf0\" },\n { \"range\": \"1\u20132\", \"min\": 1, \"max\": 2, \"color\": \"#9be9a8\" },\n { \"range\": \"3\u20135\", \"min\": 3, \"max\": 5, \"color\": \"#40c463\" },\n { \"range\": \"6\u20139\", \"min\": 6, \"max\": 9, \"color\": \"#30a14e\" },\n { \"range\": \"10+\", \"min\": 10,\"max\": null,\"color\": \"#216e39\" }\n ]\n },\n \"legend\": {\n \"orientation\": \"horizontal\",\n \"position\": \"bottom-right\",\n \"prefix\": \"Less\",\n \"suffix\": \"More\",\n \"caption\": \"<metric per day>\",\n \"swatches\": [ { \"label\": \"0\", \"color\": \"#ebedf0\" }, ... one per bucket, in order ... ]\n },\n \"tooltip\": {\n \"format\": \"<count> <unit> on <Weekday, Month D, YYYY>\",\n \"example\": \"5 commits on Tuesday, June 17, 2025\",\n \"zero_example\": \"No commits on Sunday, May 4, 2025\",\n \"note\": \"MUST include the FULL date: weekday, full month name, day, and 4-digit year\"\n },\n \"axis_label_thinning\": {\n \"x\": \"<month name shown only at the first week-column of each month, e.g. 
['Apr','May','Jun','Jul','Aug','Sep','Oct'] \u2014 NOT every week>\",\n \"y\": [\"Mon\",\"Wed\",\"Fri\"]\n },\n \"sample_data\": [\n { \"date\": \"<YYYY-MM-DD>\", \"weekday\": \"<Mon..Sun>\", \"week_index\": <int>, \"count\": <int>, \"color\": \"<bucket color>\" }\n // include ~8\u201315 concrete cells that illustrate the reader's goal\n // (e.g. weekday streaks, vacation/zero gaps, spike days)\n ],\n \"reader_takeaway\": \"<one sentence: how this layout serves the stated goal>\"\n}\n\nRUBRIC REQUIREMENTS \u2014 every one must be satisfied in the JSON:\n1. Grid: week index on the x-axis (one column per calendar week), day-of-week\n Mon\u2192Sun top-to-bottom on the y-axis. rows = 7.\n2. Cells: discrete rectangles with a small gap (use gap_px \u2265 2) on both axes.\n3. Color: sequential single-hue scale; the 0 bucket MUST be the palest color and\n darkness increases monotonically with count.\n4. title and subtitle: subtitle must name the activity AND the exact period\n (start date \u2013 end date, number of weeks, and a total).\n5. Tooltip: MUST contain the FULL date \u2014 weekday name, full month name, numeric\n day, and 4-digit year \u2014 PLUS the count (this was the weakest scored item, so\n be explicit and correct).\n6. Axis label thinning: x-axis shows month labels only where a new month starts\n (~one per month, not per week); y-axis labels only Mon/Wed/Fri.\n7. Legend: maps each shade to its count range, in order, with Less\u2192More framing.\n\nDOMAIN GUIDANCE:\n- Choose a real period consistent with the use case (e.g. \"last 26 weeks\" \u2192 ~26\n columns; \"last quarter\" \u2192 ~14 weeks; \"last 6 months\" \u2192 ~26 weeks).\n- Pick a sequential palette appropriate to the metric (GitHub greens #ebedf0\u2192\n #216e39 for activity; single-hue blues #f0f5ff\u2192#08519c for volume). 
Always\n list 4\u20136 buckets with concrete hex colors and numeric ranges that fit the\n plausible data range for the use case.\n- Make sample_data concretely demonstrate the reader's goal (streaks, gaps,\n spikes, weekly patterns) with real dates and counts.\n\nOutput ONLY the JSON object.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_calendar-heatmap/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_calendar-heatmap/report.md new file mode 100644 index 0000000..0ae083c --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_calendar-heatmap/report.md @@ -0,0 +1,5 @@ +# GEPA journey: calendar-heatmap + +rubric criteria: 7; val: 2 + +**Seed 0.3231 → Best 0.5729 (+0.2498)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_call-hierarchy/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_call-hierarchy/best_prompts.json new file mode 100644 index 0000000..2877a37 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_call-hierarchy/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'call-hierarchy' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- A root function is at the top of the hierarchy\n- Called functions are nested as children under their callers\n- Each function node shows its file/location\n- If a before/after diff, nodes are colored by change (added/removed/modified/unchanged)\n- Changed nodes carry a note explaining the change\n- Relevant nodes show meta info (e.g. 
timing or call count)\n- The hierarchy reads cleanly as a tree (clear parent-child nesting, no tangle)\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_call-hierarchy/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_call-hierarchy/report.md new file mode 100644 index 0000000..5d6564f --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_call-hierarchy/report.md @@ -0,0 +1,5 @@ +# GEPA journey: call-hierarchy + +rubric criteria: 7; val: 2 + +**Seed 0.7167 → Best 0.7167 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_class-diagram/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_class-diagram/best_prompts.json new file mode 100644 index 0000000..b4212c4 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_class-diagram/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'class-diagram' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nOutput a concise plain-text plan naming the SPECIFIC content (real class names, real attribute/method values with types, concrete edge labels) that satisfies every rubric criterion below. 
Generic plans fail \u2014 be concrete.\n\nRUBRIC \u2014 a great board MUST satisfy EVERY criterion:\n- Each class is a titled box with an attribute compartment (rows) and a method() compartment (rows), separated by a divider.\n- Method rows are visibly distinguished from attributes via trailing parentheses and a return type, e.g. `processPayment(): Receipt`. Attribute rows use `name: type` with NO parentheses.\n- Inheritance is drawn BT (parents ABOVE children); generalization arrowheads (hollow triangle \u25b7) point UP toward the parent.\n- Edges connect classes with correct arrowheads and are LABELED where appropriate.\n- Class boxes do not overlap; edges do not cross through unrelated boxes.\n- The parent-child hierarchy reads as a clear tree at a glance.\n- Every class connects to at least one other (no orphan class).\n\nCRITICAL LESSONS FROM PAST LOW SCORES \u2014 apply all of these:\n\n1. RELATIONSHIP_EDGES was repeatedly scored 0.5. A diagram with ONLY inheritance edges is too poor. You MUST include a VARIETY of relationship types with distinct, correct UML notation, and label them:\n - Generalization (inheritance): hollow triangle \u25b7 pointing at parent, no label.\n - Association: solid line, open arrow \u2192, labeled with role + multiplicity, e.g. `uses 1`, `owns 0..*`.\n - Aggregation: hollow diamond \u25c7 at the whole/container end, e.g. `Container \u25c7\u2014 Widget : contains 0..*`.\n - Composition: filled diamond \u25c6 at the owner end (lifecycle dependency), with multiplicity.\n - Dependency: dashed line with open arrow \u21e2 (e.g. a class that returns or accepts another as a parameter).\n Introduce at least one NON-inheritance relationship (aggregation/composition/association/dependency) between real classes in the domain \u2014 e.g. a base class that holds, produces, or uses another class. Give every edge an explicit multiplicity (1, 0..*, 1..*) and a role label where it is an association/aggregation/composition.\n\n2. 
ALL_CONNECTED was scored 0.5 when a note/callout was the only thing tying boxes together, OR when a helper class had no edge. Do NOT rely on dashed note boxes to satisfy connectivity. Every CLASS must have at least one real structural edge (generalization OR association/aggregation/composition/dependency) to another CLASS. Add a supporting/collateral class from the domain (e.g. a Receipt, Event, Style, Layout, Theme, Renderer, Repository) and connect it via a non-inheritance edge so the diagram is richer and fully connected through class-to-class relationships, not just a star of inheritance.\n\n3. COMPREHENSION was the weakest dimension (0.43\u20130.50). The plan must actively SERVE the reader's goal, not just list classes:\n - Tie content to the stated Goal. If the goal is \"know which methods to override,\" mark which methods are abstract in the parent (italic \u00ababstract\u00bb) and which are overridden in children (mark `(override)`), and make the overridden set obvious.\n - Show meaningful DIFFERENTIATION between siblings: give each subclass distinct attributes AND at least one subclass-specific method (not just the same three overrides repeated). This demonstrates real polymorphism/extension points and makes the diagram informative.\n - Use UML stereotypes where they aid understanding: \u00ababstract\u00bb, \u00abinterface\u00bb, \u00abenumeration\u00bb. If an enum or interface is referenced (e.g. a Status type), make it a real box and connect it.\n - Add visibility markers (`+` public, `-` private, `#` protected) on every member row \u2014 this is standard UML and improves clarity.\n\n4. STRUCTURE: prefer a slightly DEEPER or RICHER tree over a flat one-parent/three-leaf star when the domain allows (e.g. an intermediate abstract subclass, or a shared interface implemented by several classes). A purely flat fan-out is what scored lowest; even one extra level or one cross-cutting relationship raises comprehension.\n\nREQUIRED OUTPUT SECTIONS:\n1. 
Layout & Orientation \u2014 ASCII sketch showing BT placement (parent top, children below), all class boxes positioned with even horizontal spacing so inheritance edges fan out without crossing; show where non-inheritance edges route around boxes.\n2. Class Boxes \u2014 for EACH class: title (with stereotype if any), attribute compartment (`+/-/# name: type`), method compartment (`+/-/# method(args): returnType`, marked \u00ababstract\u00bb or (override) as relevant). Give siblings distinct members.\n3. Edges \u2014 a table: From | To | Relationship type | Arrowhead/notation | Label (role + multiplicity). Include at least one inheritance set AND at least one association/aggregation/composition/dependency. Confirm arrowheads point UP for inheritance.\n4. Member Distinction note \u2014 confirm methods carry `()`+return type, attributes do not, divider separates compartments.\n5. Connectivity & Readability check \u2014 list each class and the real class-to-class edge that connects it (no orphans, no note-only connections); confirm no overlaps and no edges through unrelated boxes; confirm the tree reads clearly.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_class-diagram/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_class-diagram/report.md new file mode 100644 index 0000000..ab1e307 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_class-diagram/report.md @@ -0,0 +1,5 @@ +# GEPA journey: class-diagram + +rubric criteria: 7; val: 2 + +**Seed 0.6298 → Best 0.7146 (+0.0848)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_complexity-growth/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_complexity-growth/best_prompts.json new file mode 100644 index 0000000..4ad8589 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_complexity-growth/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'complexity-growth' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Multiple growth classes/implementations are plotted as distinct labeled lines over input size\n- The y-axis uses a log/symlog scale so all growth classes are visible together\n- Data points are marked on the lines (point:true)\n- Both axes have titles (e.g. input size n / operations)\n- A legend identifies each growth class/series\n- The chart has a descriptive title and subtitle\n- Hovering shows tooltip values (tooltip enabled)\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_complexity-growth/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_complexity-growth/report.md new file mode 100644 index 0000000..7f60d37 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_complexity-growth/report.md @@ -0,0 +1,5 @@ +# GEPA journey: complexity-growth + +rubric criteria: 7; val: 2 + +**Seed 0.3905 → Best 0.3905 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_correlation-heatmap/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_correlation-heatmap/best_prompts.json new file mode 100644 index 0000000..641ffb1 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_correlation-heatmap/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'correlation-heatmap' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- The chart is a square N\u00d7N matrix with identical sorting on both axes\n- The numeric value is printed inside each cell (text layer over rect)\n- Cell background color encodes the value (diverging scale for correlation, sequential for latency)\n- Cell text stays legible on every cell (light text over dark/hot cells, dark over light)\n- Both axes are titled naming the two dimensions\n- The chart has a title and subtitle\n- Tooltip shows both keys and the value for a cell\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_correlation-heatmap/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_correlation-heatmap/report.md new file mode 100644 index 0000000..57443ea --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_correlation-heatmap/report.md @@ -0,0 +1,5 @@ +# GEPA journey: correlation-heatmap + +rubric criteria: 7; val: 2 + +**Seed 0.8379 → Best 0.8379 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_critical-path/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_critical-path/best_prompts.json new file mode 100644 index 0000000..113f045 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_critical-path/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'critical-path' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Tasks are laid out left-to-right reflecting the time/schedule axis\n- The critical path tasks are highlighted red with thick edges and a 'bolt'-style icon (not read as 'failed')\n- Tasks with slack are shown neutral/grey with dashed edges, distinct from the critical chain\n- Each task shows its duration (e.g. '6d') in a meta line\n- Each task shows its owner (and optionally slack) in a sub line\n- A two-row legend distinguishes critical path (red) from has-slack (grey-dash)\n- No task nodes overlap and edges do not pass through unrelated nodes\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. 
Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_critical-path/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_critical-path/report.md new file mode 100644 index 0000000..ddddc8f --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_critical-path/report.md @@ -0,0 +1,5 @@ +# GEPA journey: critical-path + +rubric criteria: 7; val: 2 + +**Seed 0.2059 → Best 0.2059 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_data-lineage/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_data-lineage/best_prompts.json new file mode 100644 index 0000000..c0a4ecb --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_data-lineage/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'data-lineage' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Nodes are grouped into labeled tier zones (Sources / Staging / Marts / Consumers)\n- Data flows in one consistent direction from sources toward consumers\n- Datasets show a freshness/SLA indicator in a meta line\n- At least one stale-past-SLA edge is shown dashed/red distinct from fresh green edges\n- Fresh data-flow edges are colored green\n- A legend distinguishes fresh vs stale edges\n- No nodes overlap and edges do not pass through unrelated nodes\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "You are generating a data-lineage board as a single valid JSON object for a graph/diagram renderer. 
Output ONLY the JSON \u2014 no prose, no code fences.\n\n## Task\nGiven a plan describing a data-lineage board (data flowing through tiers: SOURCES \u2192 STAGING \u2192 MARTS \u2192 CONSUMERS), produce a JSON diagram that bakes in the SPECIFIC node names, freshness/SLA metadata, statuses, and edges named in the plan. Use the exact values from the plan \u2014 never placeholders.\n\n## Output JSON shape\n{\n \"direction\": \"LR\",\n \"groups\": [ { \"id\", \"label\", \"color\" }, ... ],\n \"nodes\": [ { \"id\", \"group\", \"position\": {\"x\",\"y\"}, \"width\", \"height\", \"data\": { \"label\", \"status\" }, \"style\": {...} }, ... ],\n \"edges\": [ { \"source\", \"target\", \"animated\", \"style\": {...}, \"data\": { \"label\" } }, ... ]\n}\n\n## CRITICAL: how each rubric criterion is satisfied\n\n### 1. fresh_edges_green / stale edges colored (this ALWAYS failed before)\nEdge color is NOT conveyed by the label text. You MUST set explicit color via the edge `style` object. For EVERY edge:\n- FRESH edges (within SLA): `\"style\": { \"stroke\": \"#22c55e\", \"strokeWidth\": 2 }` and `\"animated\": false`. Do NOT set strokeDasharray (solid line). Label like \"fresh \u2014 within SLA\".\n- STALE edges (past SLA): `\"style\": { \"stroke\": \"#ef4444\", \"strokeWidth\": 2, \"strokeDasharray\": \"6 4\" }` and `\"animated\": true`. Label flags it, e.g. \"STALE \u2014 past SLA\".\nUse green (#22c55e) for solid fresh, red (#ef4444) dashed for stale. This is mandatory on every single edge.\n\n### 2. stale_path_flagged\nThe entire stale chain (root stale source \u2192 ... \u2192 affected consumer) must form an UNBROKEN set of red dashed edges. The root-cause source node label must explicitly mark it (e.g. \"\u2190 ROOT CAUSE\" / \"LATE FEED\") and its `status` must be \"warn\". Every node on the stale path gets `status: \"warn\"`. The affected consumer label must reference the upstream cause. 
Verify the red edges connect source\u2192staging\u2192mart\u2192consumer with no gaps.\n\n### 3. tier_zones\nDefine one group per tier in `groups` (sources, staging, marts, consumers), each with a distinct `label` and `color`. Assign EVERY data node to its correct `group`. To make zones read as distinct labeled columns, also place nodes so each tier occupies its own x-band (see positioning).\n\n### 4. legend_present\nInclude a dedicated legend node. Give it its own group (e.g. a \"legend\" group OR place it clearly), set `status: \"info\"`, and place it at a non-overlapping position (e.g. bottom-left, below all data nodes). Its label must explain: green solid = Fresh (within SLA), red dashed = Stale (past SLA), and the meta status icons (\u2705 FRESH / \u274c STALE / \u26a0\ufe0f AFFECTED).\n\n### 5. no_overlap AND on-canvas (geometry failed every time)\nYou MUST set explicit `position {x,y}`, `width`, and `height` on every node. Do not rely on auto-layout. Rules:\n- Canvas: keep ALL nodes within x \u2208 [0, 1280] and y \u2208 [0, 720]. Nothing may exceed these bounds (off-canvas was penalized repeatedly).\n- Node size: width 220, height 90 (labels are multi-line; give enough height).\n- Columns by tier: SOURCES x=20, STAGING x=290, MARTS x=560, CONSUMERS x=830. This gives a 270px column pitch (50px gap between 220-wide nodes). Keep the rightmost node's x+width \u2264 1280.\n- Stack nodes vertically within a column: first node y=20, then increment y by 120 per node (90 height + 30 gap). Keep max y+height \u2264 720; if a column has many nodes, reduce row pitch to 110 but never overlap.\n- Place the root stale source and its downstream stale path in the BOTTOM row of each column so the red dashed chain runs along the bottom without crossing fresh nodes.\n- Legend node: place at x=20, y near the bottom (e.g. 
y=600) in empty space, width 360, height 90, ensuring it does not overlap any data node or exceed canvas.\n\n## Node label & status conventions\n- `data.label`: include the node name, its type/system in parens, and the freshness/SLA/status meta line exactly as the plan specifies. Use \"\\n\" to put meta on a new line.\n- `data.status`: \"success\" for FRESH nodes, \"warn\" for STALE/AFFECTED nodes, \"info\" for the legend.\n\n## Direction\nAlways \"LR\" (left\u2192right). All edges point rightward (source tier index < target tier index). No backward edges.\n\n## Final self-check before output\n- Every edge has a `style` with stroke color (green solid for fresh, red dashed for stale) \u2014 not just a label.\n- A legend node exists with status \"info\".\n- Every node has position, width, height; no two nodes' rectangles overlap; all within [0,1280]\u00d7[0,720].\n- The stale path is a continuous red dashed chain from root source to affected consumer, with the root cause flagged.\n- All node names, meta values, and edges match the plan exactly.\nOutput ONLY the JSON object." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_data-lineage/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_data-lineage/report.md new file mode 100644 index 0000000..6a15343 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_data-lineage/report.md @@ -0,0 +1,5 @@ +# GEPA journey: data-lineage + +rubric criteria: 7; val: 2 + +**Seed 0.2054 → Best 0.3386 (+0.1332)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_debug-snapshot/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_debug-snapshot/best_prompts.json new file mode 100644 index 0000000..c1e5ee3 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_debug-snapshot/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'debug-snapshot' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are designing a visual board (panes laid out on a 2D canvas). It will be\nrendered, so SPACE IS LIMITED. Plan for a clean, non-overflowing layout \u2014 keep\nevery pane compact. Verbosity and oversized content are the #1 failure mode:\ncontent that overflows or clips is penalized hard. Favor short labels, few rows,\nshort strings, and minimal decoration.\n\n== TASK ==\nProduce a concise plain-text plan for a 3-pane board that captures ONE single\ninstant of a program failure. Name SPECIFIC, REAL content (concrete values,\nlabels, file:line, frame names). Generic plans fail.\n\n== LAYOUT (mandatory) ==\nThree panes stacked strictly TOP \u2192 BOTTOM:\n PANE 1 (top) = ERROR (alert)\n PANE 2 (middle) = STATE (variable table at failure point)\n PANE 3 (bottom) = CALL STACK (flow)\nError/state panes MUST sit above the call-stack pane. Label each pane explicitly\nand unambiguously (e.g. \"ERROR\", \"STATE @ failure\", \"CALL STACK (top-down)\").\n\n== PANE 1 \u2014 ERROR (alert component) ==\nA single red alert banner. 
Include: the exception/error type, the exact error\nmessage string, and the failing location (file:line). Keep to ~3 short lines.\nOptionally one timestamp/thread id. Do NOT add extra paragraphs.\n\n== PANE 2 \u2014 STATE (table) ==\nA compact table of the variable values AT the failure point. 3\u20135 rows max.\nColumns: Variable | Value (| Type optional). Use real values that directly\nexplain the failure (e.g. the null var, the bad index, the conflicting locks).\nMark the offending variable clearly (e.g. a \u26a0 on its row). At most ONE short\nnote line under the table. Keep cell text short to avoid wide-table overflow.\n\n== PANE 3 \u2014 CALL STACK (flow pane) \u2014 this is the most-failed criterion ==\nThis MUST read as a genuine FLOW: a vertical chain of connected frame nodes with\nEXPLICIT directional connectors between every adjacent pair. Requirements:\n - Order TOP-DOWN: the innermost/crash frame is at the TOP, the entry point\n (e.g. main) at the BOTTOM.\n - Show each frame as a small node: function name + file:line. Keep to 4\u20136\n frames; trim noise frames (runtime/dispatcher internals).\n - Put a visible directional connector between EVERY adjacent frame (e.g. a\n downward arrow \u2193 with a tiny \"called by\" label) so it unmistakably reads as a\n flow, not a list.\n - HIGHLIGHT EXACTLY ONE culprit frame so it is visually unmistakable: give it\n a distinct marker AND distinct styling (e.g. \u2605 + bold red fill/border) and a\n SHORT inline \"CULPRIT\" tag. All other frames stay plain/neutral. Make the\n contrast obvious. Do not highlight more than one frame. Do not bury the\n highlight inside large ASCII boxes \u2014 keep frame boxes small so the highlight\n is the focal point.\n\n== COHERENCE (reads_as_snapshot) ==\nAll three panes must describe the SAME frozen moment. Tie them together\nexplicitly with shared concrete anchors: the same file:line, the same culprit\nframe, the same variable values, the same timestamp/thread. 
End with ONE short\ncoherence sentence stating the single instant (e.g. \"All panes show the instant\nitems[5] is read in process_line (order_processor.py:42), i=5 \u2265 len=5\").\n\n== STYLE RULES ==\n- Be concrete and specific; never use placeholders.\n- Keep it compact \u2014 short lines, small tables, small frame nodes. Assume it will\n be drawn and must fit without clipping.\n- Avoid long annotations, multi-line fix explanations, and decorative full-width\n ASCII boxes. Trim anything that doesn't directly satisfy a criterion.\n\n== CHECKLIST (every item MUST be satisfied) ==\n[ ] Panes stacked top-down, error/state above call stack\n[ ] Alert pane shows the exact error message\n[ ] Variable table of values at the failure point present\n[ ] Call-stack pane is a true flow with connectors, ordered top-down\n[ ] Exactly one culprit frame visually highlighted (marker + distinct style)\n[ ] Every pane clearly labeled (error/state vs call stack)\n[ ] One coherent moment, tied by shared file:line/values/time", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_debug-snapshot/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_debug-snapshot/report.md new file mode 100644 index 0000000..382b373 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_debug-snapshot/report.md @@ -0,0 +1,5 @@ +# GEPA journey: debug-snapshot + +rubric criteria: 7; val: 2 + +**Seed 0.6360 → Best 0.6498 (+0.0138)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_diy-project-plan/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_diy-project-plan/best_prompts.json new file mode 100644 index 0000000..e93d0d0 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_diy-project-plan/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are planning a 'diy-project-plan' board that will be RENDERED as a visual board. Your output is a spec the renderer turns into components, so every required element must be concrete, complete, and explicitly present \u2014 not merely described in prose.\n\nINPUTS\n- Use case: {topic}\n- Reader: {audience}\n- Goal: {purpose}\n\nTASK\nProduce a concise, top-down, single-column plan for a DIY project. Name SPECIFIC content (real values, labels, structure). Generic plans fail. The board is judged against a strict rubric; you MUST satisfy EVERY criterion below with a concrete, renderable artifact.\n\nCRITICAL LESSONS (these caused past failures \u2014 do not repeat):\n1. CONTENT OVERFLOWS/CLIPS in every prior attempt. Keep it COMPACT. Hard caps:\n - Materials/tools table: max ~8 rows total.\n - Build-steps checklist: max ~10 short steps (one line each, <12 words).\n - One intro line only (no long paragraphs).\n - Trim every section to essentials so it fits without scrolling/clipping.\n2. video_reference, budget_chart, and where_to_buy repeatedly scored 0.0 even when written as prose. 
They must be emitted as ACTUAL structured artifacts, not descriptions:\n - VIDEO CARD: include a real thumbnail IMAGE reference (a concrete image URL, e.g. a YouTube thumbnail https://img.youtube.com/vi/<VIDEO_ID>/hqdefault.jpg) AND a real video link. Use the literal word \"thumbnail\" with an image, plus title, channel, duration. The card must read as an image card, not text.\n - BUDGET CHART: emit a COMPLETE, valid VegaLite spec (with $schema, data.values, mark, encoding). It must stand alone and render. Always include a total callout.\n - WHERE TO BUY: emit an actual MAP component (an embeddable map reference) AND a paired numbered store list with location/pins. Make it clearly a map, not just a bullet list.\n3. title_badges must have a clear title PLUS a row of at least 3 badges (difficulty, time, cost) \u2014 keep badges short.\n\nREQUIRED SECTIONS (in this exact top-down order, single column, stacked, NOT a peer grid):\n1. Header \u2014 Title + one short subtitle + badge row (Difficulty, Time, Cost, optional Size). Use concrete real values.\n2. Materials & Tools table \u2014 columns: Item | Spec | Qty | Est. Cost. Keep \u22648 rows; costs must sum to the stated total.\n3. Interactive build-steps checklist \u2014 \u226410 checkbox steps, each one short line, in build order. Include a progress indicator (e.g. \"0 / N complete\").\n4. Video tutorial card \u2014 thumbnail IMAGE (concrete URL) + play overlay, title, channel, duration, one \"why watch\" line, and a link.\n5. Budget chart \u2014 full VegaLite JSON (horizontal bar, x=cost USD, y=category sorted by cost), data values matching the materials costs, plus a \"Total: $X\" callout.\n6. Where to buy \u2014 an embedded MAP with numbered pins + a matching numbered store list (store name, items they carry, distance). 
Include 3\u20134 real-type stores (Home Depot, Lowe's, Ace Hardware, a local lumber/landscape supplier).\n\nCONSISTENCY RULES\n- The budget chart values, the materials table costs, and the cost badge/total must all agree.\n- Map pins must match the store list one-to-one.\n- Keep the whole thing tight so nothing clips; favor fewer, sharper items over completeness.\n\nOUTPUT\nPlain text. State each section with its concrete content. Emit the VegaLite as valid JSON in a code block. End with a one-line note confirming single-column top-down flow.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_diy-project-plan/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_diy-project-plan/report.md new file mode 100644 index 0000000..b1a1c8d --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_diy-project-plan/report.md @@ -0,0 +1,5 @@ +# GEPA journey: diy-project-plan + +rubric criteria: 7; val: 2 + +**Seed 0.3605 → Best 0.4569 (+0.0964)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_er-diagram/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_er-diagram/best_prompts.json new file mode 100644 index 0000000..c2cf6b3 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_er-diagram/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan an 'er-diagram' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are producing a concrete, render-ready plan for an ER diagram. It is judged on comprehension, visual quality, geometry (everything on-canvas), and a strict rubric. 
PRIOR PLANS REPEATEDLY SCORED 0.0 ON \"relationship_edges\" AND \"arrowheads\" DESPITE DESCRIBING THEM IN PROSE. The fix: specify every edge as an explicit, structured connection record \u2014 not a narrative \u2014 with named markers on BOTH ends and concrete anchor points. Follow the rules below exactly.\n\n=== OUTPUT STRUCTURE (use these exact sections) ===\n\n1. CANVAS & LAYOUT\n- State the canvas as an explicit coordinate box, e.g. canvas = 1000 (wide) x 1400 (tall), origin top-left.\n- Place every entity box with explicit x,y (top-left corner), width, height so ALL boxes fit fully inside the canvas with >=40px margin on every side. NOTHING off-canvas.\n- Lay out top-to-bottom: assign each entity a tier (row). Junction/bridge tables go in the middle tiers between their parents. Give a single-column or clearly-tiered vertical arrangement.\n- Give explicit gutters (>=60px vertical between tiers) and confirm no two boxes' rectangles overlap by checking coordinate ranges.\n\n2. ENTITY BOXES\nFor each entity output a titled box with one row per field as:\n field_name : DATA_TYPE [KEY]\n- KEY markers: PK, FK, \"PK,FK\" for composite junction keys, UNIQUE where relevant.\n- For every FK write the exact target: \"FK -> Table.column\".\n- Use realistic SQL types (INT/SERIAL, VARCHAR(n), CHAR(n), TEXT, DATE, TIMESTAMP, DECIMAL(p,s), SMALLINT, BOOLEAN).\n\n3. RELATIONSHIP EDGES (THIS IS THE MOST FAILED CRITERION \u2014 BE EXPLICIT AND STRUCTURED)\nList each relationship as a numbered, self-contained record. Use EXACTLY this template, one per line per field:\n Edge N:\n from: <Entity>.<column> anchor: <side of box: top/bottom/left/right>\n to: <Entity>.<column> anchor: <side of box>\n label: \"<verb phrase>\" (e.g. \"places\", \"contains\", \"writes\")\n source_cardinality: <e.g. 1..1>\n target_cardinality: <e.g. 0..* or 1..*>\n source_marker: <named marker, e.g. one-bar \"|\" >\n target_marker: <named marker, e.g. 
crow's-foot \"<\" for many>\n route: <how the line travels; which box sides it leaves/enters and that it touches no unrelated box>\n\nREQUIREMENTS for edges/arrowheads:\n- EVERY edge MUST have a visible marker on BOTH ends. State them explicitly:\n - \"one\" side = bar marker \"|\" (or \"||\" for exactly-one).\n - \"many\" side = crow's-foot marker \"<\" (or \">\").\n - optional/zero = circle \"o\" prepended (e.g. \"o<\" for zero-or-many).\n Write the literal marker glyphs so they are unambiguously present, e.g. target_marker: \"o<\" (zero-or-many, crow's-foot).\n- The cardinality label AND the marker are separate, both required, on the same edge.\n- Anchors must connect to box SIDES that face each other so the line is short and crosses no other box.\n- Resolve every many-to-many through its junction table with TWO edges (parent1 -> junction, parent2 -> junction), each fully specified.\n\n4. ORPHAN CHECK\n- List each entity and the edge numbers it participates in. Every entity MUST appear in >=1 edge. If any entity has no relationship, ADD a real, sensible relationship for it (do not leave it). Self-referencing FKs (e.g. Category.parent_id -> Category.id) must ALSO be drawn as an explicit edge with markers.\n\n5. GEOMETRY / NON-CROSSING PROOF\n- For each edge, name the two box sides it uses and confirm by coordinates it does not pass through any third box.\n- Re-confirm all boxes are within canvas bounds (give the max x+width and y+height and show they are < canvas size).\n\n=== HARD RULES ===\n- Be concrete with REAL values (table names, column names, types, labels) drawn from the use case. Generic plans fail.\n- Top-to-bottom flow only.\n- Every edge: label + cardinality + named marker on BOTH ends, all explicitly written.\n- No off-canvas elements, no overlapping boxes, no edges through unrelated boxes, no orphan entities.\n- Add one short reader-facing takeaway note tailored to the Goal (e.g. 
the key join path or the M:N highlight).\n\nOutput plain text following the section structure above.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_er-diagram/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_er-diagram/report.md new file mode 100644 index 0000000..cfcbe31 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_er-diagram/report.md @@ -0,0 +1,5 @@ +# GEPA journey: er-diagram + +rubric criteria: 7; val: 2 + +**Seed 0.4758 → Best 0.5143 (+0.0385)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_event-driven/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_event-driven/best_prompts.json new file mode 100644 index 0000000..21fb310 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_event-driven/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'event-driven' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Producers fan in to a central broker/topic node which then fans out to consumers\n- A clearly identified broker/topic/queue node sits between producers and consumers\n- Stream/queue edges are visually marked as flowing (animated dash) rather than static plain lines\n- A dead-letter/error path is shown with a dashed red edge to a DLQ node\n- The topology is laid out top-to-bottom\n- Producer, broker, and consumer nodes are color-coded by role (not all grey)\n- No node boxes overlap and edges do not pass through unrelated nodes\n\nOutput a concise plain-text plan naming the SPECIFIC content 
(real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "You generate a single valid JSON object describing an event-driven architecture board (flow diagram). The diagram visualizes a publish/subscribe / event-bus pipeline.\n\nINPUTS YOU RECEIVE\n- skill_context, topic, audience, purpose, and a detailed Plan.\n- The Plan names SPECIFIC tiers, nodes, colors (hex), edge labels, partitions, consumer groups, and DLQ details. Bake EVERY specific value from the Plan into the output. No placeholders.\n\nOUTPUT\nOutput ONLY the JSON object \u2014 no prose, no markdown fences.\n\n============================================================\nSCHEMA (follow field placement EXACTLY)\n============================================================\n{\n \"direction\": \"TB\",\n \"groups\": [ { \"id\", \"label\", \"color\" } ],\n \"nodes\": [ { \"id\", \"group\", \"position\": {\"x\",\"y\"}, \"color\", \"data\": { \"label\", \"status\" } } ],\n \"edges\": [ { \"source\", \"target\", \"animated\", \"dashed\", \"color\", \"data\": { \"label\" } } ]\n}\n\nCRITICAL FIELD-PLACEMENT RULES (these were the main scoring failures):\n\n1. ANIMATED STREAM EDGES \u2014 put `animated` and `dashed` at the EDGE TOP LEVEL, NOT inside `data`.\n Every normal streaming edge (producer\u2192broker fan-in, broker\u2192consumer fan-out) MUST have:\n \"animated\": true, \"dashed\": true\n Also give these flow edges a stream color at top level (e.g. the broker/orange or blue hex).\n (Past outputs scored animated_streams 0.0 because these fields were missing or buried in data.)\n\n2. DEAD-LETTER PATH \u2014 must be a dedicated edge from each FAILING CONSUMER (not from the broker/topic)\n to a dedicated DLQ node. Each DLQ edge MUST be:\n \"animated\": true, \"dashed\": true, \"color\": \"<RED hex from plan>\"\n and carry an error label (e.g. \"delivery failed \u00d75\", \"write failure / retries exhausted\",\n \"deserialization error\"). 
Route DLQ edges from consumers, give the DLQ node its own RED group.\n (dead_letter_path was weak when DLQ came from the topic instead of consumers, lacked red color,\n or lacked dashed/animated styling.)\n\n3. NODE COLORING \u2014 set `color` on EACH node (the hex of its tier), in addition to the group color.\n Do not rely on group color alone. (nodes_colored 0.5 otherwise.)\n\n============================================================\nLAYOUT / GEOMETRY (avoid overlaps and off-canvas)\n============================================================\nUse explicit `position: {x, y}` on every node. Keep ALL coordinates within canvas bounds:\nx in [0, 900], y in [0, 700]. Do NOT exceed these (off-canvas penalties occurred at x=650+ with\nwide nodes, and at large spreads).\n\nTop-to-bottom 4-tier layout, generous spacing:\n- Tier 1 Producers: y = 40. Spread x evenly, e.g. x = 120, 360, 600 (use only as many as named).\n- Tier 2 Broker: y = 220. Single centered node, x \u2248 360.\n- Tier 3 Consumers: y = 400. Spread x evenly, e.g. x = 120, 360, 600.\n- Tier 4 DLQ: y = 580. Offset toward right but in-bounds, e.g. x = 700.\n\nSpacing rules to prevent overlap:\n- Minimum 200px horizontal gap between sibling nodes' x positions.\n- Minimum 150px vertical gap between tiers.\n- Keep labels concise so wide nodes don't push past x=900. Prefer short labels; put extra detail\n after a newline (\\n) rather than one very long line.\n- Place the DLQ offset to the right and below consumers so red edges route around (not through)\n consumer nodes.\n\n============================================================\nEVENT-BUS DOMAIN RULES (always apply)\n============================================================\n- direction MUST be \"TB\" (strict top-to-bottom flow).\n- Exactly ONE broker/topic node sits between producers and consumers. ALL producers fan IN to it;\n it fans OUT to ALL consumers. 
It is the only node any cross-tier flow edge touches.\n- 4 role-colored tiers with distinct hex colors (use the exact hexes the Plan specifies):\n Producers, Broker/Topic, Consumers, Dead-Letter.\n- Fan-in edges: producer \u2192 broker, label like \"produce\"/\"publish\", animated+dashed.\n- Fan-out edges: broker \u2192 consumer, label with the consumer group (e.g. \"group: ledger-cg\")\n or \"subscribe/poll\", animated+dashed.\n- DLQ edges: failing consumer \u2192 DLQ node, animated+dashed+RED, error label.\n- node `status` values: producers \"info\", broker \"warn\" or \"active\", consumers \"success\",\n DLQ \"warn\". Use valid status strings only.\n- group ids must match the `group` field referenced by nodes. edge source/target must match node ids.\n\nCHECKLIST before output:\n\u2611 Every flow edge has top-level animated:true, dashed:true, color.\n\u2611 DLQ edges originate from consumers, are red, dashed, animated, with error labels.\n\u2611 Every node has a position and a color, all coords within [0,900]\u00d7[0,700].\n\u2611 Sibling nodes \u2265200px apart; tiers \u2265150px apart; no overlaps.\n\u2611 Exactly one broker node; all specific Plan values (partitions, groups, event names) included.\n\u2611 Output is ONLY the JSON object." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_event-driven/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_event-driven/report.md new file mode 100644 index 0000000..3232c1a --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_event-driven/report.md @@ -0,0 +1,5 @@ +# GEPA journey: event-driven + +rubric criteria: 7; val: 2 + +**Seed 0.4616 → Best 0.5363 (+0.0747)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_explainer/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_explainer/best_prompts.json new file mode 100644 index 0000000..7f834d2 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_explainer/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'explainer' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- The top pane opens with a plain-language hook/surprising claim, not the abstraction\n- A flow pane shows the reasoning chain as color-coded nodes (postulate\u2192reasoning\u2192conclusion)\n- A vegalite pane shows the key quantitative relationship with a title saying what to notice\n- A component pane gives concrete numbers and a real-world anchor\n- A common misconception is addressed (an alert)\n- The panes are stacked top-down as a cohesive explainer (hook\u2192mechanism\u2192chart\u2192grounding)\n- Each layer is short, building intuition over symbols\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. 
Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_explainer/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_explainer/report.md new file mode 100644 index 0000000..475bffa --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_explainer/report.md @@ -0,0 +1,5 @@ +# GEPA journey: explainer + +rubric criteria: 7; val: 2 + +**Seed 0.4588 → Best 0.4588 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_flame-graph/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_flame-graph/best_prompts.json new file mode 100644 index 0000000..0a436c4 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_flame-graph/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'flame-graph' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Frames are stacked by call depth (deeper calls below their parent)\n- Each frame's width represents its share of CPU samples/time\n- Child frames sit within the horizontal x-range of their parent\n- Frames are colored on a warm palette by self-time (hotter = more self-time)\n- Frames show a (possibly truncated) function name and time\n- A caption notes that width = time\n- The widest (hot-path) frame stack is identifiable\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_flame-graph/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_flame-graph/report.md new file mode 100644 index 0000000..636bb33 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_flame-graph/report.md @@ -0,0 +1,5 @@ +# GEPA journey: flame-graph + +rubric criteria: 7; val: 2 + +**Seed 0.7178 → Best 0.7178 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_flight-comparison/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_flight-comparison/best_prompts.json new file mode 100644 index 0000000..10e3c26 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_flight-comparison/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'flight-comparison' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Each flight option is its own card/row in a comparison\n- Each flight shows its total price\n- Each flight shows departure/arrival times and total duration\n- Each flight shows its number of stops (nonstop vs layovers)\n- The cheapest or best-overall option is flagged with a badge\n- Each flight shows its airline (logo or name)\n- Each flight has an outbound book/select link\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "You generate a SINGLE valid JSON object describing a UI built from Mantine-like\ncomponents. The output renders visually and is scored on: composition (visual\nbalance/density), visual quality, geometry (must NOT overflow or clip), and\nrubric coverage. Output ONLY the JSON object \u2014 no prose, no markdown fences, no\nexplanation. 
Your response MUST begin with \"{\" and end with \"}\".\n\n\u26a0\ufe0f CRITICAL FAILURE MODES OBSERVED (do not repeat):\n1. Producing NO JSON at all (e.g. echoing the plan, writing prose, or stopping\n early). You MUST always emit the complete JSON object. This is the single\n most important rule \u2014 an empty or non-JSON response scores zero.\n2. Content overflow/clipping (geometry penalty). Caused by too many cards, too\n much content per row, oversized logos, nested cards, dividers + badge rows\n eating vertical space. Keep every card to a SINGLE row of Grid content.\n\n========================================================================\nINPUT FORMAT\n========================================================================\nYou receive: skill context, a Use case, a Reader (audience), a Goal, and a\ndetailed Plan (markdown). The Plan names SPECIFIC values (airlines, prices,\ntimes, durations, stops, badges, footer notes, routes, dates). Bake those EXACT\nvalues from the PLAN into the JSON. Never use placeholders like \"TBD\", \"$XXX\",\nor generic labels. Never copy values from these instructions' examples \u2014 always\nread the actual plan you are given and use ITS airlines, routes, prices, and\ndates. (Past failures mismatched the title/route from a different plan.)\n\n========================================================================\nTASK (flight comparison board)\n========================================================================\nBuild a flight comparison board. The rubric requires, for EVERY flight in the plan:\n1. Its own card (one card per flight).\n2. Total price shown prominently (see PRICE rule below).\n3. Departure + arrival times AND total duration.\n4. Number of stops (Nonstop, or \"1 stop\" with layover airport + duration).\n5. Airline name + logo.\n6. 
A booking link/button (\"Select Flight \u2192\" or similar) per card.\nAnd ONE flight must be clearly flagged as cheapest/best (see FLAG rule).\n\nInclude ALL flights named in the plan (some plans have 3, some 5). Use the exact\ntitle, subtitle (route/dates/sort), and footer note from the plan.\n\n========================================================================\nCOMPONENT VOCABULARY (use only these)\n========================================================================\nStack {gap}, Group {justify, align, gap}, Grid {columns, align} + Grid.Col {span},\nCard {withBorder, padding, radius, style}, Title {order}, Text {fw, size, c},\nBadge {color, size}, Image {src, alt, h, w}, Anchor {href}, Divider, Alert\n{color, title}, Blockquote {color}, Table {data:{head,body}}, SimpleGrid {cols,\nspacing}.\n- Every node is {\"type\": \"...\", \"props\": {...}, \"children\": [...]}.\n- Logos: use \"https://placehold.co/40x40?text=XX\" (small, ~40px) where XX is the\n airline/flight code. Avoid large logos that bloat card height.\n- Real booking hrefs when known: Delta https://www.delta.com, United\n https://www.united.com, American https://www.aa.com, JetBlue\n https://www.jetblue.com, Spirit https://www.spirit.com, Norse\n https://flynorse.com, Aer Lingus https://www.aerlingus.com, Icelandair\n https://www.icelandair.com. 
If unknown, use the airline's plausible domain.\n\n========================================================================\nLAYOUT STRATEGY \u2014 prioritize geometry (avoid overflow/clipping)\n========================================================================\nUSE the vertical-stack layout:\n- Top-level: Stack {gap:\"md\"}.\n- A Title (order 2) with the board name from the plan, then a dimmed Text\n subtitle (route / dates / sort note) from the plan.\n- A compact column-header strip as a bordered Card (padding \"sm\") containing a\n Grid (12 cols) with dimmed bold Text labels using these spans:\n Airline (span 3) | Times (span 3) | Duration/Stops (span 2) | Price (span 2) | Action (span 2).\n- One full-width Card per flight, each containing ONE Grid (12 cols, align\n \"center\") with the SAME column spans as the header strip.\n- Close with an Alert (color \"green\") with a title naming the winning pick and\n price, summarizing the recommendation / cheapest pick reason from the plan.\n\nKEEP CARDS SHORT to avoid clipping:\n- Do NOT add a Divider inside flight cards.\n- For the flagged card, place the winner Badge in the SAME Grid as a small extra,\n OR put it as the first cell \u2014 do not add an extra full Group row + Divider that\n inflates height. Prefer a single compact row. (Extra badge-row + divider per\n card contributed to overflow.)\n- Limit per-card vertical content: airline = Group {Image + Text}; times = one\n Text; duration/stops = Stack {gap:0} with two short Texts; price = Text (plus\n at most one tiny dimmed note); action = Anchor.\n\nDO NOT:\n- Use SimpleGrid {cols:3} with tall content \u2014 it caused content overflow/clipping.\n- Nest Cards inside Cards.\n- Put flight details inside a Table inside a narrow card \u2014 buries price and\n overflows. 
Keep details as inline Grid.Col Text elements.\n- Add long Alerts/notes inside every card; keep per-card extras minimal.\n\n========================================================================\nPRICE rule (fixes price_shown 0.5)\n========================================================================\nRender the total price as a STANDALONE, prominent element on each card:\nText {fw:700, size:\"xl\"} with the exact value from the plan (e.g. \"$298\"). Do\nNOT place the price inside a Table or mix it into a duration/stops block. If a\nper-person, family-total, or \"before bag fees\" note exists in the plan, add a\nsmall dimmed Text {size:\"xs\"} beneath it \u2014 but the dollar figure stays bold/large.\n\n========================================================================\nFLAG rule (fixes cheapest_or_best_flagged 0.5)\n========================================================================\nMake the flagged flight UNMISTAKABLE \u2014 use BOTH a badge AND a highlighted card:\n- Badge {color:\"green\", size:\"lg\"} with text like \"\ud83c\udfc6 CHEAPEST\" or\n \"\ud83c\udfc6 BEST VALUE\" (match the plan's flag wording).\n- AND give that card a distinct border:\n Card {withBorder:true, style:{\"borderColor\":\"green\",\"borderWidth\":2}}.\nPlace the winner Badge compactly (e.g. above the airline name in the airline\nGrid.Col, or as a small Group at card top \u2014 but avoid adding a Divider after it).\nOther cards may carry neutral/secondary badges (e.g. \"Fastest\", color \"blue\" or\n\"gray\", size \"md\") but only the chosen one gets the green winner treatment.\nReference the flag reason in the closing Alert.\n\n========================================================================\nSTOPS / TIMES formatting\n========================================================================\n- Nonstop: Text {c:\"green\", fw:600, size:\"sm\"} \"Nonstop\".\n- 1+ stops: Text {c:\"orange\", fw:600, size:\"sm\"} e.g. 
\"1 stop \u2014 1h 20m ORD\"\n (include layover airport + duration).\n- Times: Text {size:\"sm\"} \"7:05 AM (JFK) \u2192 1:18 PM (SFO)\"; add \"(+1 day)\" for\n overnight/red-eye arrivals exactly as the plan indicates.\n- Duration (\"6h 13m\") in Text {size:\"sm\", fw:600} ABOVE the stops text, both\n inside a Stack {gap:0}.\n\n========================================================================\nGENERAL\n========================================================================\n- Satisfy EVERY rubric criterion with the EXACT plan values.\n- Keep it dense but balanced; favor the flat vertical layout for clean geometry.\n- Re-check before finishing: did you emit valid JSON? Does the title match THIS\n plan's route? Is every flight present? Is exactly one flight green-flagged?\n- Output ONLY the single JSON object." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_flight-comparison/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_flight-comparison/report.md new file mode 100644 index 0000000..a71ad17 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_flight-comparison/report.md @@ -0,0 +1,5 @@ +# GEPA journey: flight-comparison + +rubric criteria: 7; val: 2 + +**Seed 0.8681 → Best 0.9226 (+0.0545)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_gantt/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_gantt/best_prompts.json new file mode 100644 index 0000000..ab5d170 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_gantt/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'gantt' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nProduce a CONCISE plain-text plan that names SPECIFIC content (real dates, labels, names, hex colors, structure). Generic plans fail. 
Every section below is scored against a strict rubric \u2014 satisfy EVERY criterion explicitly.\n\nREQUIRED SECTIONS (use these headers, in this order):\n\n1. TITLE & SUBTITLE\n - One concrete title naming the project.\n - One subtitle that includes the today/status date AND the timeline window. Use full ISO dates (YYYY-MM-DD) in the subtitle, e.g. \"Status as of 2025-06-09 \u00b7 Timeline: 2025-05-05 \u2192 2025-08-15\".\n\n2. X-AXIS (dated temporal axis)\n - State an explicit start date and end date in ISO format (YYYY-MM-DD).\n - State tick interval (daily/weekly/monthly) with 5\u20137 example tick labels.\n - State the date format used on ticks AND the format used in tables/tooltips. IMPORTANT: use full ISO (YYYY-MM-DD) for all dates in the TASKS table and ALL tooltips. Abbreviated formats are allowed ONLY on axis ticks.\n\n3. TASKS (horizontal bars, sorted by start date ascending, top\u2192bottom)\n - A table with columns: Task | Start | End | Status | Owner.\n - 7\u20139 rows. Status MUST be one of exactly: done / active / blocked / planned.\n - Rows MUST be listed in ascending start-date order (earliest at top).\n - Use real ISO dates (YYYY-MM-DD) that fall inside the x-axis window, real owner names, and a mix of statuses that includes at least one done, one active, one blocked, and one planned.\n - State plainly: each row = one horizontal bar from its start date to its end date; y-axis is ordered by start date.\n\n4. STATUS COLOR SCALE (explicit legend)\n - Map ALL FOUR statuses to specific hex colors:\n done = #4CAF50 (green), active = #2196F3 (blue), blocked = #F44336 (red), planned = #9E9E9E (gray).\n - State the legend is rendered with swatches + labels.\n\n5. TODAY RULE LINE (heavily weighted \u2014 be thorough)\n - A vertical rule at a specific ISO date (e.g. 
\"2025-06-09\") that falls WITHIN the x-axis window and intersects at least one active AND at least one blocked bar (name those bars and their date ranges).\n - Specify it is a DASHED line in a distinct color: dark orange #FF6F00.\n - Specify a VISIBLE TEXT LABEL anchored at the top of the line, stating the exact text, e.g. label = \"Today \u2014 Jun 9\".\n - Confirm the today date is identical to the subtitle's status date.\n\n6. TOOLTIP SPECIFICATION (heavily weighted \u2014 this is the section most often scored low; be exhaustive)\n - State that EVERY bar shows a hover tooltip containing exactly these four fields, each on its own line, with these exact labels:\n Task: <task name>\n Status: <Done | Active | Blocked | Planned>\n Owner: <owner name>\n Dates: <start date> \u2192 <end date>\n - Dates in tooltips MUST be full ISO format (YYYY-MM-DD \u2192 YYYY-MM-DD). Do NOT abbreviate.\n - First give the field template, then FILL IT IN FOR EVERY TASK in the TASKS table (all 7\u20139 rows), not just a few examples. Each filled tooltip must populate all four fields with the row's real task name, capitalized status, real owner, and full ISO start\u2192end dates.\n - Ensure at least one filled tooltip is a blocked task, one is an active task, and one is a done task.\n - Explicitly confirm the tooltip names all four required fields: Task, Status, Owner, AND Dates (start and end).\n\nSTYLE: Be tight and concrete. Avoid filler prose, narrative \"takeaway\" paragraphs, and repetition. Every claim must carry a specific value (a date, a name, a color, a label string). The only place exhaustive repetition is required is the filled tooltips in Section 6 \u2014 provide one for every task.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. 
Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_gantt/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_gantt/report.md new file mode 100644 index 0000000..1ed88a8 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_gantt/report.md @@ -0,0 +1,5 @@ +# GEPA journey: gantt + +rubric criteria: 7; val: 2 + +**Seed 0.3085 → Best 0.4435 (+0.1350)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_growth-funnel/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_growth-funnel/best_prompts.json new file mode 100644 index 0000000..690f2f2 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_growth-funnel/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'growth-funnel' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nProduce a CONCISE plain-text plan that names SPECIFIC, concrete content (real numeric values, exact labels, explicit structure). Generic plans fail. The plan is judged against a strict rubric \u2014 EVERY criterion below must be satisfied explicitly and unambiguously.\n\nREQUIRED CONTENT \u2014 produce these labeled sections in this order:\n\n1. TITLE & SUBTITLE\n - Title: a specific descriptive chart title naming the funnel.\n - Subtitle: include the explicit time period (e.g. \"Mar 1\u201331, 2024\") plus scope/total if relevant.\n\n2. STAGES (DESCENDING) \u2014 this is the most important section\n - List the funnel stages with their counts as an explicit ordered list, sorted STRICTLY from the LARGEST count (widest, top of funnel) to the SMALLEST count (narrowest, bottom of funnel).\n - Verify and state that each count is smaller than the one above it (monotonically decreasing). 
Choose realistic values so that NO stage breaks the descending order.\n - State explicitly: \"Bars are sorted by count, descending \u2014 widest at top, narrowest at bottom\" and \"Stages render top\u2192bottom in funnel order.\"\n - Present the data as a clean ordered list, e.g.:\n 1. Applicants \u2014 1,200\n 2. Screened \u2014 500\n 3. Interviewed \u2014 180\n 4. Offered \u2014 60\n 5. Hired \u2014 30\n - Each entry IS the stage axis label AND its printed numeric count (bar length proportional to count, numeric value printed at the bar's end).\n\n3. AXES\n - Count axis (horizontal): give it an explicit TITLE (e.g. \"Number of Candidates\") and a numeric range from 0 to the largest count.\n - Stage axis (vertical): explicitly labels EACH stage by name, listed in the same descending funnel order, top\u2192bottom.\n\n4. BAR LABELS\n - State that every bar is annotated with its numeric count, and list those exact printed values (e.g. \"1,200\", \"500\", \"180\", \"60\", \"30\").\n\n5. TOOLTIP \u2014 must explicitly contain BOTH required fields\n - Give the exact tooltip format showing the STAGE NAME and the COUNT, e.g.: \"{Stage}: {Count} {unit}\" \u2192 \"Interviewed: 180 candidates\".\n - List a concrete example for at least one stage so both fields are unambiguous.\n\n6. READING ORDER\n - State explicitly that stages read top\u2192bottom in funnel order (widest first), and name the first and last stage to confirm direction.\n\nGUIDANCE:\n- Keep it tight and literal \u2014 favor explicit ordered lists over decorative prose, callout boxes, or extra commentary. 
Comprehension is scored; verbosity and clutter hurt it.\n- You MAY add a single brief drop-off/conversion annotation if it serves the reader's goal, but never let it obscure the required sections.\n- Double-check the two most-failed criteria before finishing: (a) counts are strictly descending top\u2192bottom, and (b) the tooltip clearly shows both stage AND count.", + "generate": "You are generating a single Vega-Lite chart specification (schema v6) from a structured plan. Output ONLY one valid JSON object \u2014 the Vega-Lite spec. No prose, no markdown, no placeholders. Bake in the EXACT title, subtitle, stage names, numeric counts, conversion percentages, axis titles, and annotations named in the plan.\n\nCONTEXT: {skill_context}\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\nPlan to follow:\n{plan}\n\nThis is a \"Growth-Funnel Board\" task: render a horizontal funnel as a bar chart, bars sorted WIDEST (top) \u2192 NARROWEST (bottom) in funnel order. Build the spec to satisfy these rubric criteria, which past attempts repeatedly failed:\n\n=== 1. STAGES DESCENDING & STAGE ORDER CLEAR (often scored 0) ===\n- Add an integer \"order\" field to every data row (1 = top/widest stage \u2026 N = bottom/narrowest).\n- Encode y with: {\"field\": \"stage\", \"type\": \"nominal\", \"sort\": {\"field\": \"order\", \"order\": \"ascending\"}, \"axis\": {\"title\": \"<stage axis title>\"}}.\n- The bars MUST visibly descend in length from top to bottom (counts are already descending; the order field guarantees vertical order).\n- Do NOT introduce a separate categorical color field (e.g. \"highlight\") that reorders or visually fragments the funnel. If you must color a highlighted stage, keep the funnel's descending order intact and keep a single coherent reading order. Prefer a single consistent bar color (e.g. \"#4c78a8\") unless the plan explicitly requires an accent on one stage.\n\n=== 2. 
TOOLTIP DETAILS (this scored 0.0 in EVERY example \u2014 fix it) ===\n- Put a full tooltip ARRAY directly in the bar layer's encoding. Keeping \"tooltip\": true on the bar mark is fine, but it is NOT enough on its own.\n- The tooltip array must list MULTIPLE fields: stage name, count (formatted with thousands separators), AND any context the plan provides (conversion % from prior, absolute drop from prior). Example:\n \"tooltip\": [\n {\"field\": \"stage\", \"type\": \"nominal\", \"title\": \"Stage\"},\n {\"field\": \"count\", \"type\": \"quantitative\", \"title\": \"Count\", \"format\": \",\"},\n {\"field\": \"conversion\", \"type\": \"nominal\", \"title\": \"Conversion from prior\"},\n {\"field\": \"drop\", \"type\": \"quantitative\", \"title\": \"Drop from prior\", \"format\": \",\"}\n ]\n- Include these extra fields (conversion, drop, etc.) in EVERY data row so the tooltip resolves. Match the plan's exact percentages and drop values.\n- Ensure the tooltip is bound to the BAR layer (the layer the user hovers), not buried where it won't render.\n\n=== 3. VALUE LABELS (printed numbers at bar tips \u2014 scored 0.5 when x omitted) ===\n- Add a text layer that prints the numeric count at each bar tip. This text layer MUST bind the x channel explicitly so labels sit at the bar end:\n {\n \"mark\": {\"type\": \"text\", \"align\": \"left\", \"dx\": 5, \"fontWeight\": \"bold\"},\n \"encoding\": {\n \"x\": {\"field\": \"count\", \"type\": \"quantitative\"},\n \"text\": {\"field\": \"count\", \"type\": \"quantitative\", \"format\": \",\"}\n }\n }\n- Format all counts with thousands separators (format \",\").\n- If the plan calls for a secondary muted label (conversion %), add a second text layer offset with dy, color \"#888\", smaller fontSize, also binding x to count.\n\n=== 4. 
AXES ===\n- x (count axis): {\"field\": \"count\", \"type\": \"quantitative\", \"title\": \"<exact count axis title from plan>\", \"scale\": {\"domain\": [0, <max from plan>]}, \"axis\": {\"format\": \",\", \"grid\": true}}. Use the plan's exact axis title and domain max.\n- y (stage axis): as in section 1.\n\n=== STRUCTURE TEMPLATE ===\n{\n \"$schema\": \"https://vega.github.io/schema/vega-lite/v6.json\",\n \"title\": {\"text\": \"<plan title>\", \"subtitle\": \"<plan subtitle>\"},\n \"width\": \"container\",\n \"height\": <single integer between 300 and 320>,\n \"data\": {\"values\": [ /* one row per stage with stage, count, order, conversion, drop, ... */ ]},\n \"encoding\": { \"y\": { /* stage, sorted by order ascending */ } },\n \"layer\": [\n { \"mark\": {\"type\": \"bar\", \"tooltip\": true, \"color\": \"#4c78a8\"},\n \"encoding\": { \"x\": { /* count, scale domain, axis title */ }, \"tooltip\": [ /* full multi-field array */ ] } },\n { /* value-label text layer with explicit x binding */ },\n { /* optional secondary conversion-% text layer */ }\n ]\n}\n\nRULES:\n- Define the shared y encoding once at the top level; define x inside the bar layer (and bind x in each text layer).\n- Every data row must carry all fields referenced by any tooltip or text encoding.\n- Use the plan's literal values everywhere \u2014 never invent values or leave placeholders.\n- Output must be a single, parseable JSON object and nothing else." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_growth-funnel/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_growth-funnel/report.md new file mode 100644 index 0000000..7a33f9b --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_growth-funnel/report.md @@ -0,0 +1,5 @@ +# GEPA journey: growth-funnel + +rubric criteria: 7; val: 2 + +**Seed 0.4108 → Best 0.4975 (+0.0867)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_k8s-topology/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_k8s-topology/best_prompts.json new file mode 100644 index 0000000..b04a544 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_k8s-topology/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are planning a 'k8s-topology' board that will be RENDERED on a fixed canvas and scored against a strict automated rubric. The rubric checks the actual rendered board \u2014 not your prose \u2014 so you must specify exact, literal, renderable values (coordinates, hex colors, status strings, edge endpoints) for EVERY element. Vague or prose-only descriptions score 0 even if conceptually correct.\n\nINPUTS:\n- Use case: {topic}\n- Reader: {audience}\n- Goal: {purpose}\n\nCANVAS & GEOMETRY (critical \u2014 prior plans lost points to off-canvas nodes):\n- Assume a canvas of 1200 (wide) x 800 (tall), origin top-left.\n- Give every node an explicit (x, y) top-left position AND a size (w, h). Keep every node fully inside 40..1160 horizontally and 40..760 vertically (leave margins). Never place a node or any part of it outside these bounds.\n- Zone containers also need explicit (x, y, w, h) and must enclose their child nodes with padding; children must NOT spill outside their zone.\n- Minimum 30px gap between any two nodes so nothing overlaps.\n- Route edges so they connect only the two endpoints and do not pass through any third (unrelated) node. 
State each edge's source-anchor and target-anchor side (e.g., \"from bottom of A to top of B\") chosen so the line is clear of other nodes.\n\nREQUIRED CONTENT \u2014 satisfy EVERY rubric criterion with concrete values:\n\n1. NAMESPACE/CLUSTER ZONES (label each explicitly):\n - One outer cluster zone, e.g. \"Cluster: prod-us-east-1\".\n - At least two labeled namespace zones, e.g. \"Namespace: ingress-nginx\" and \"Namespace: web\" (pick names fitting the use case).\n - A separate \"Nodes\" zone for Node objects.\n - Each zone must have a visible text label AND explicit x/y/w/h. Place resources INSIDE the correct zone's bounds.\n\n2. RESOURCE KINDS as labeled nodes \u2014 include ALL of: Ingress, Service, Deployment, Pod(s), ConfigMap, Secret, Node. Label each node with both KIND and NAME, e.g. \"Ingress: api-ingress\", \"Service: api-svc (ClusterIP :80\u21928080)\", \"Deployment: api (replicas 3)\", \"ConfigMap: api-config\", \"Secret: api-secret\", \"Node: node-1\". If the use case truly has no Ingress (e.g. a queue worker), still include all other kinds and add a note \u2014 but prefer including every kind when plausible.\n\n3. TRAFFIC PATH must be fully traceable via edges. Spell out the EXACT chain of edges as separate edges:\n Ingress -> Service -> (Deployment is NOT in the traffic path) -> each Pod.\n Concretely: edge Ingress->Service, then edge Service->Pod1, Service->Pod2, Service->Pod3. The Service must connect DIRECTLY to the Pods (not to the Deployment) for the traffic path. Lay these out in a single straight visual lane (e.g. left-to-right or top-to-bottom column) so the chain is unbroken and uncrossed.\n\n4. 
EDGE COLORS BY PURPOSE \u2014 use these EXACT, distinct hex colors and apply them literally to each edge (do not just name them in the legend; assign a color to every edge):\n - traffic = #1E88E5 (blue, solid)\n - manages = #8E24AA (purple, solid) [Deployment -> each Pod]\n - config-mount = #FB8C00 (orange, dashed) [ConfigMap -> each Pod; Secret -> each Pod]\n - scheduled-on = #757575 (gray, dotted) [each Pod -> its Node]\n List every edge with: source, target, purpose, hex color, line style. Four purposes must use four clearly different colors.\n\n5. POD STATUS \u2014 every Pod node must carry a literal status string in its label (e.g. \"Status: Running\") AND a fill color matching status:\n - Running = green fill #43A047\n - Pending = amber fill #FDD835\n - Failed/CrashLoopBackOff = red fill #E53935\n Give at least the realistic mix for the use case; state each pod's status and fill hex explicitly.\n\n6. LEGEND \u2014 include an explicit legend node placed on-canvas (give it x/y/w/h, e.g. bottom-left at x=60,y=640,w=260,h=110). It must list every edge purpose mapped to its exact hex color AND the pod status colors:\n - #1E88E5 = traffic\n - #8E24AA = manages\n - #FB8C00 = config-mount\n - #757575 = scheduled-on\n - #43A047 = Pod Running (+ #FDD835 Pending if used)\n The legend is a required visible board element; describe its rows literally.\n\n7. NO OVERLAP / NO CROSSING THROUGH NODES \u2014 explicitly describe the layout that prevents this: place ConfigMap and Secret OFF the central traffic axis (e.g. 
to one side) so config-mount edges route around the pod column; route manages edges (Deployment->Pods) and config-mount edges on opposite sides of the pod column; drop scheduled-on edges straight down to the Nodes zone so they don't cross traffic edges.\n\nOUTPUT FORMAT:\nA concise plain-text plan with these sections:\n- Zones: each with label + (x,y,w,h).\n- Nodes: each with kind+name label, (x,y,w,h), and (for pods) status string + fill hex.\n- Edges: a table/list of every edge with source, target, purpose, hex color, line style, and the anchor sides used to avoid crossings.\n- Legend: explicit rows with hex colors (with its own x,y,w,h).\n- Traffic path: the explicit edge chain Ingress->Service->Pods.\n- Layout/anti-overlap notes.\n\nUse REAL, specific values (concrete names, ports, hex colors, coordinates). Generic or coordinate-free plans fail. Confirm at the end that every node lies within canvas bounds (40..1160, 40..760) and that all 7 rubric criteria are met.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_k8s-topology/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_k8s-topology/report.md new file mode 100644 index 0000000..b0668cc --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_k8s-topology/report.md @@ -0,0 +1,5 @@ +# GEPA journey: k8s-topology + +rubric criteria: 7; val: 2 + +**Seed 0.3541 → Best 0.4394 (+0.0853)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_map-routes/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_map-routes/best_prompts.json new file mode 100644 index 0000000..32ee63e --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_map-routes/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'map-routes' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nOutput a concise plain-text plan that names SPECIFIC, concrete content (real place names, real-looking coordinates, real labels, distances, times) for every element below. Generic plans fail.\n\nSTRUCTURE \u2014 USE A HUB-AND-SPOKE (STAR) LAYOUT:\n- Define ONE origin and 3\u20135 destinations.\n- Draw a SEPARATE route polyline from the ORIGIN directly to EACH destination (origin\u2192dest1, origin\u2192dest2, ...). Do NOT chain destinations together into a tour (A\u2192B\u2192C). Do NOT add return-to-origin legs. Each route must visibly connect the origin to one destination.\n- Keep it simple and easy to read \u2014 favor clarity over optimization narratives.\n\nSATISFY EVERY RUBRIC CRITERION EXPLICITLY:\n\n1. Interactive map with tiles\n - State: OpenStreetMap (Leaflet) tile layer, pan/zoom enabled.\n - Center on a real city; give approximate center coords.\n\n2. 
Multiple labeled markers (1 origin + 3\u20135 destinations)\n - Present markers in a table with columns: Marker | On-map Label | Coordinates | Color.\n - Every marker MUST have a short, distinct, descriptive on-map text label (e.g. \"Depot \u2014 Central Warehouse\", \"Acme Office Supply\"). State explicitly that these labels render as visible text/tooltips on each marker. Do not leave markers as bare pins.\n\n3. Route polylines (origin \u2192 each destination)\n - One polyline per destination, each a distinct color. List them in a table: Leg | From \u2192 To | Line color | Label.\n\n4. Route labels \u2014 THIS IS THE MOST COMMONLY FAILED CRITERION; BE EXPLICIT:\n - EVERY polyline carries its OWN label rendered AS TEXT ON/AT THE LINE (anchored at the line's midpoint as a white-background tooltip).\n - Each label states distance AND time, e.g. \"4.2 km \u00b7 11 min\".\n - State plainly: \"Each of the N route lines has a midpoint text label drawn directly on the map.\" Labels in the legend alone do NOT count \u2014 they must be on the lines.\n\n5. Color-coded markers\n - Origin = GREEN. All destinations = a single contrasting color (e.g. RED or BLUE). State this explicitly.\n\n6. Legend table beside the map\n - A table to the right of the map keying: origin pin, destination pin, and each route line (with its distance/time).\n - Include a total-distance/total-time summary row.\n\n7. Full-extent view (no markers cut off)\n - State: use fitBounds over ALL marker coordinates with ~10\u201312% padding so every marker and label is visible.\n\nLAYOUT:\n- Left ~70%: interactive tile map with markers and labeled polylines.\n- Right ~30%: legend table + summary row.\n\nKeep the whole plan concrete and uncluttered. 
Prioritize: (a) every route line explicitly drawn from origin to a destination, (b) every route line carrying its own on-map distance+time text label, (c) every marker carrying a distinct on-map text label.", + "generate": "You are generating a single valid JSON UI specification (Mantine-style component tree) for an interactive \"Map-Routes\" board. Output ONLY the JSON object \u2014 no prose, no markdown, no code fences.\n\n## INPUT FORMAT\nYou receive:\n- skill_context: general component/rendering rules\n- Use case / Reader / Goal: the scenario\n- Plan: a detailed markdown spec naming SPECIFIC values (title, coordinates, marker labels, colors, route legs, distances/times, legend rows, totals)\n\nBake in the SPECIFIC values from the plan \u2014 never use placeholders.\n\n## CRITICAL: THE Map COMPONENT SCHEMA\nPast attempts scored 0.0 on EVERY map rubric criterion (interactive_map, labeled_markers, route_lines, route_labels, marker_colors, fits_destinations) because invented prop names were used and the map rendered empty. You MUST use the exact prop schema below. Do NOT invent alternative prop names.\n\nThe Map component MUST be authored as:\n\n{\n \"type\": \"Map\",\n \"props\": {\n \"height\": 460,\n \"center\": { \"lat\": <number>, \"lng\": <number> },\n \"zoom\": <integer>,\n \"fitBounds\": true,\n \"markers\": [\n {\n \"lat\": <number>,\n \"lng\": <number>,\n \"label\": \"<short text label shown on/near marker>\",\n \"popup\": \"<click popup text>\",\n \"color\": \"green\" | \"blue\" | \"orange\" | \"red\"\n }\n ],\n \"routes\": [\n {\n \"color\": \"<color>\",\n \"label\": \"<e.g. '2.1 km \u00b7 7 min'>\",\n \"path\": [ { \"lat\": <number>, \"lng\": <number> }, { \"lat\": <number>, \"lng\": <number> } ]\n }\n ]\n }\n}\n\nRules that make each rubric criterion pass:\n1. interactive_map \u2014 Always include `\"center\"` as a {lat,lng} object, an integer `\"zoom\"`, and a numeric `\"height\"`. Pan/zoom is enabled by default; do not add disabling props.\n2. 
labeled_markers \u2014 Every marker MUST have a non-empty `\"label\"` AND a `\"popup\"`. Include the origin plus every destination from the plan.\n3. marker_colors \u2014 The origin/home/depot marker MUST be `\"color\": \"green\"`. Destination/stop markers MUST be `\"color\": \"blue\"`. If the plan flags a final stop, you may make it `\"orange\"`, but keep at least one green origin and blue destinations distinct.\n4. route_lines \u2014 Use the `\"routes\"` array (NOT \"lines\" or \"polylines\"). Each route is ONE leg: a `\"path\"` array of {lat,lng} objects running from that leg's start point to its end point exactly as the plan defines it. For a hub-and-spoke plan that means Origin \u2192 Destination, one route per destination; only chain stops (Origin \u2192 Stop1 \u2192 Stop2 \u2192 ...) when the plan itself lists chained legs. One route object per leg.\n5. route_labels \u2014 Every route object MUST have a `\"label\"` containing the leg's time and/or distance exactly as given in the plan.\n6. fits_destinations \u2014 Set `\"fitBounds\": true` so all markers stay in view. The `center`/`zoom` should still roughly frame the marker set. Do NOT pass a coordinate array to fitBounds; use the boolean `true`.\n\nUse {lat, lng} OBJECTS everywhere (not [lat, lng] arrays). 
This is the single most important correctness rule \u2014 array-style coordinates caused the empty-map failures.\n\n## SUPPORTING LAYOUT (raises completeness/vq)\nWrap the map in a clear board:\n- A Title (order 2) with the plan's exact title string.\n- A short dimmed Text or a Badge summarizing the route (total distance \u00b7 time \u00b7 stop count) using the plan's exact totals.\n- A Grid: left Grid.Col span 8 holds the Map; right Grid.Col span 4 holds a Card with:\n - A \"Legend\" Title (order 4/5)\n - A Table whose head/body rows exactly mirror the plan's legend table (symbols + meanings/details)\n - A Divider + a bold Text with the plan's total summary line.\n- Optionally below the grid, a Card or SimpleGrid breaking down each leg (route, time, distance) using plan values.\n\n## GENERAL RULES\n- Produce ONE root object (typically a Stack with gap \"md\").\n- Children that are single elements may be objects; multiple children must be arrays. Be consistent and valid JSON.\n- Tables use {\"data\": {\"head\": [...], \"body\": [[...], ...]}}.\n- Every value (coordinates, labels, distances, times, colors, titles, legend rows, totals) must come from the plan \u2014 bake them in literally.\n- Output ONLY the final JSON object." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_map-routes/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_map-routes/report.md new file mode 100644 index 0000000..661b657 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_map-routes/report.md @@ -0,0 +1,5 @@ +# GEPA journey: map-routes + +rubric criteria: 7; val: 2 + +**Seed 0.2908 → Best 0.4962 (+0.2054)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_metrics-dashboard/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_metrics-dashboard/best_prompts.json new file mode 100644 index 0000000..c925041 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_metrics-dashboard/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'metrics-dashboard' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nProduce a concise plain-text plan that names SPECIFIC, REALISTIC content (real numbers, real labels, real chart data) \u2014 generic plans fail. The plan is judged on comprehension (does the content instantly answer the reader's goal?), visual quality, geometry, and a strict rubric.\n\n== RUBRIC (every item MUST be satisfied) ==\n1. A row of stat cards, each with a dimmed label (top) and a large value (below).\n2. Stat cards carry color-coded status badges where relevant.\n3. An embedded VegaLite trend chart inside a card.\n4. A clear dashboard title heading.\n5. The chart fits its card width (use VegaLite \"width\": \"container\" \u2014 never a fixed pixel width that bleeds).\n6. Clean top-down stack: title \u2192 stats row \u2192 chart.\n7. All stat values and labels are legible and aligned.\n\n== HOW TO SCORE HIGH ON COMPREHENSION (the hard part) ==\nComprehension is the weakest dimension. Maximize it by making the content itself tell the story:\n- Choose 4 stat cards that are the MOST decision-relevant metrics for THIS reader and goal. 
Lead with the headline metric.\n- Give each value a realistic, specific, in-domain number with correct units/formatting ($, %, ms, counts with commas, durations like \"2m 38s\").\n- Every badge must encode a clear judgment, not just a number. State the direction AND whether it's good or bad for the reader:\n - green = good/healthy, amber = watch/elevated, red = bad/breach.\n - Be careful with metric polarity: a RISING bounce rate / churn / error rate / latency is BAD (red); a rising MRR / sessions / conversion / uptime is GOOD (green). Annotate why (e.g. \"bounce up \u2192 red, worse\").\n - Where useful, name the threshold that triggers the badge (e.g. \"Elevated, threshold >2%\").\n- Keep labels short, concrete, and uppercase/small-caps dimmed; keep values large and bold.\n\n== VALUES READABLE (a noted weak spot) ==\n- Make values the visual focus: large (~28\u201332px), bold, high-contrast \u2014 never dimmed.\n- Keep value strings short and clean; avoid cramming. Labels dimmed (~12px), values prominent.\n- Ensure values and labels are left-aligned within each card and baseline-aligned across cards; badges pinned top-right corner.\n\n== CHART CARD ==\n- Give the chart a dimmed card title naming the metric and time window (e.g. \"P95 LATENCY \u2014 LAST 60 MIN\").\n- Provide concrete sample data: list x-values (time/day/month) with realistic y-values that tell a coherent trend matching the headline stat card.\n- Specify chart type (usually line with point markers), labeled axes with units, subtle gridlines, \"width\": \"container\", a reasonable height (~200\u2013220px), inner padding so it never overflows.\n- Where a threshold/SLO matters, add a dashed reference rule (e.g. red 300ms SLO line).\n\n== OUTPUT FORMAT ==\n- Plain text. Start with the title plan, then the stats-row plan (one block per card with Label / Value / Badge+reason), then the chart-card plan, then brief layout & legibility/alignment notes.\n- Be concrete throughout. 
Tie every choice to helping {audience} achieve {purpose} at a glance.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_metrics-dashboard/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_metrics-dashboard/report.md new file mode 100644 index 0000000..0f466f9 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_metrics-dashboard/report.md @@ -0,0 +1,5 @@ +# GEPA journey: metrics-dashboard + +rubric criteria: 7; val: 2 + +**Seed 0.6848 → Best 0.7712 (+0.0865)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_observability-dashboard/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_observability-dashboard/best_prompts.json new file mode 100644 index 0000000..37d77ec --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_observability-dashboard/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan an 'observability-dashboard' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are producing a concise plain-text plan for a 2x2 mission-control observability dashboard. The plan is judged on a strict rubric (already reliably passed) AND a comprehension score. The #1 problem in past plans is LOW COMPREHENSION caused by density and clutter. Your single most important job is MAXIMIZING CLARITY: make every pane minimal, scannable, and instantly readable. When in doubt, CUT content. Fewer, clearer data points always beat more.\n\n=== ANTI-CLUTTER RULES (these drive the comprehension score \u2014 follow strictly) ===\n- Keep each tile to ~4\u20135 bullet lines MAX. 
Shorter is better.\n- SLO Panel: include only ONE sub-metric (error budget remaining OR burn rate \u2014 pick one, prefer error budget). Do NOT list both.\n- Tables: exactly 3 rows, exactly 4 columns. Never more.\n- Chart series: prefer 3 concise series for latency (p50/p95/p99); for throughput prefer 2\u20133 series. Keep numbers small and clean.\n- Avoid huge/awkward numbers (e.g., 84,000 / 1,450). Prefer compact, round, easy-to-read values.\n- Do not add optional annotations, extra series, footnotes, or secondary thresholds. One threshold line on latency only.\n- Use consistent, terse phrasing. Every label must be unambiguous on first read.\n\n=== RUBRIC (every criterion MUST be satisfied) ===\n1. Panes arranged as an independent-tile 2x2 grid (mission-control board): top-left, top-right, bottom-left, bottom-right \u2014 each self-contained, no shared axes.\n2. An SLO panel with ring progress AND a status badge.\n3. A latency chart pane with p50/p95/p99.\n4. A top-errors table and/or an alert pane.\n5. A throughput chart pane.\n6. The grid MIXES component panels and chart panels.\n7. Each chart pane has a title AND axis titles (both X and Y).\n\n=== STANDARD LAYOUT (use this exact assignment) ===\n- Tile 1 (top-left): SLO Panel \u2192 COMPONENT panel\n- Tile 2 (top-right): Latency Chart \u2192 CHART panel\n- Tile 3 (bottom-left): Top-Errors Table + Alert \u2192 COMPONENT panel\n- Tile 4 (bottom-right): Throughput Chart \u2192 CHART panel\nThis gives 2 component panels + 2 chart panels (satisfies the mix criterion).\n\n=== DOMAIN FACTS & CONCRETE CONTENT (adapt to the use case; keep minimal) ===\nMake content SPECIFIC to the use case (real endpoints, real error codes, realistic but clean numbers).\n\nSLO Panel (Tile 1):\n- Title naming the SLO and window, e.g. \"Availability SLO \u2014 30d\".\n- Ring progress: one concrete percentage (e.g. 99.82% filled) with a target marker (e.g. 
99.90%).\n- Status badge with explicit color logic and one chosen state, e.g.:\n green \"HEALTHY\" (\u226599.9%), amber \"AT RISK\" (99.5\u201399.9%), red \"BREACHED\" (<99.5%).\n- Exactly ONE sub-metric: error budget remaining (e.g. 21%).\n\nLatency Chart (Tile 2):\n- Title with unit, e.g. \"Request Latency (ms)\".\n- X-axis title (time window, e.g. \"Time (last 6h)\"), Y-axis title (e.g. \"Latency (ms)\").\n- Three series with clean values: p50 (e.g. 45 ms), p95 (e.g. 220 ms), p99 (e.g. 480 ms).\n- One dashed SLO threshold line (e.g. 300 ms).\n\nTop-Errors Table + Alert (Tile 3):\n- Title with window, e.g. \"Top Errors (last 1h)\".\n- One red alert banner line, e.g. \"\ud83d\udd34 ALERT: High 5xx on POST /v1/orders \u2014 firing 8m\".\n- Exactly 4 columns, exactly 3 rows: Endpoint/Error | Code | Count | Rate.\n Use real endpoints (e.g. POST /v1/orders) and real status codes (503, 500, 429, 504) relevant to the use case. Keep counts modest/clean.\n\nThroughput Chart (Tile 4):\n- Title, e.g. \"Request Throughput\".\n- X-axis title (time), Y-axis title with unit (e.g. \"Requests / sec (RPS)\" or a domain-appropriate count).\n- 2\u20133 series with clean values (e.g. 2xx \u2248 3,200 RPS, 4xx \u2248 140 RPS, 5xx \u2248 60 RPS). Avoid oversized numbers.\n\n=== OUTPUT FORMAT (optimize for comprehension) ===\n- Start with a one-line layout statement and a small ASCII 2x2 diagram labeling each tile and its type (component/chart).\n- Then one short section per tile, in order Tile 1\u21924. Label each with its position (top-left, etc.) and panel type.\n- Use short bullet lines. Bold the key labels (Title, Ring, Badge, X-axis, Y-axis, Series, Alert, Columns).\n- Keep each tile tight: ~4\u20135 bullet lines MAX. Trim aggressively.\n- End with a brief \"Rubric Coverage\" checklist mapping each criterion to its tile with a \u2713.\n\nBe concrete (generic plans fail) but ruthlessly disciplined (cluttered plans lose comprehension). 
Every label must be unambiguous and instantly readable.", + "generate": "TASK\nYou generate a single valid JSON object describing a multi-pane observability dashboard. The JSON renders inside a fixed-size 2x2 grid UI. It must (a) bake in EVERY specific value from the Plan verbatim, (b) fit within small tiles without overflow/clipping, and (c) satisfy a structural rubric. Output ONLY the JSON \u2014 no prose, no code fences.\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#1 MOST IMPORTANT: COMPLETENESS \u2014 COPY THE PLAN'S EXACT VALUES\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\nThe dominant scoring failure is \"completeness\" (comp). It is low (0.36\u20130.50) whenever you substitute generic placeholder values instead of the Plan's actual named values. The Plan gives you exact titles, percentages, counts, endpoint names, error codes, rates, axis titles, thresholds, badge states, and series numbers. You MUST transcribe ALL of them verbatim. NEVER invent your own numbers when the Plan specifies them.\n\nConcrete rule: before writing each pane, extract from the Plan:\n- The EXACT tile title (e.g. \"Checkout Availability SLO \u2014 30d\", NOT \"API Availability SLO\").\n- The EXACT ring % (e.g. 99.82, NOT 99.94) and target marker (e.g. 
99.90%).\n- The EXACT badge state derived from the Plan's thresholds (if Plan says \"amber AT RISK\", use yellow \"\u25b2 AT RISK\", NOT green HEALTHY).\n- The EXACT sub-metrics (e.g. \"Error budget remaining \u2014 18%\", \"Burn rate \u2014 1.4x\").\n- The EXACT chart title, x-axis title, y-axis title (e.g. y = \"Concurrent streams (count)\", x = \"Time (last 6h)\").\n- The EXACT series names + peak values (e.g. p50=120ms, p95=640ms, p99=1,250ms; threshold 1,000ms).\n- The EXACT table title, alert text, column headers, and every row (endpoint/error, code, count, rate) \u2014 copy each row's strings and numbers exactly.\n- The EXACT throughput series names (e.g. Playing/Buffering/Errored, NOT 2xx/4xx/5xx) and their values, plus any capacity reference.\n\nSelf-check before output: for each Plan bullet, confirm its literal value appears in your JSON. If the Plan says \"POST /v1/orders | 503 | 412 | 6.1%\", that exact row must be present. Mismatches tank the comp score even when geometry and rubric are perfect.\n\nINPUT FORMAT\nYou receive:\n- skill_context (capabilities/schema notes)\n- Use case (topic), Reader (audience), Goal (purpose)\n- A detailed Plan (markdown) with the exact layout, tiles, titles, metric values, table rows, chart series, axis titles, thresholds, legend labels.\n- An implicit rubric.\n\nOUTPUT SCHEMA\n{\n \"layout\": \"grid\",\n \"panes\": [ <exactly 4 panes for a 2x2 grid, in the Plan's tile order> ]\n}\nEach pane: { \"title\": \"<exact plan title>\", \"type\": \"<component|vegalite|markdown>\", \"content\": <below> }\n\nPANE TYPE RULES\n- \"vegalite\": content is a Vega-Lite spec object (charts).\n- \"component\": content is a Mantine component tree: nodes shaped {\"type\",\"props\",\"children\"}.\n- \"markdown\": content is a markdown string.\n\nTYPICAL 2x2 OBSERVABILITY LAYOUT (map each Plan tile in order)\n- TILE 1 (component): SLO panel \u2014 RingProgress (current availability %) + status Badge + compact sub-metrics (SLO target, error 
budget remaining, burn rate).\n- TILE 2 (vegalite): latency percentile chart \u2014 three lines p50/p95/p99 + dashed SLO/SLA threshold rule.\n- TILE 3 (component): top-errors Table with a firing Alert banner ABOVE it.\n- TILE 4 (vegalite): throughput area chart (RPS / streams / orders) stacked by status/source, optional capacity reference rule.\n\nSTATUS BADGE LOGIC (use the Plan's stated thresholds & state)\n- current \u2265 target \u2192 green \"\u25cf HEALTHY\"\n- between thresholds \u2192 yellow \"\u25b2 AT RISK\"\n- well below \u2192 red \"\u25a0 BREACH\"\nIf the Plan explicitly names the state (e.g. \"amber AT RISK\"), use that exact state and color regardless.\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\nGEOMETRY / OVERFLOW (keep clean \u2014 was failing in Example 1)\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\nEach tile is SMALL (\u2248 half-width, half-height of viewport). Overflow/clipping fails geometry. To keep geometry clean (as in Examples 2 & 3):\n- RingProgress: size 100, thickness 12, roundCaps true. NEVER 160.\n- In the SLO pane, prefer label/value Group rows ({\"justify\":\"space-between\",\"w\":\"100%\"}) over many stacked full-width Text lines. Use Stack gap \"xs\".\n- Tables: include ONLY the rows the Plan names. 
Use {\"striped\":true,\"withTableBorder\":true}; optionally \"fontSize\":\"xs\". No padding-heavy props, no wide content forcing horizontal scroll.\n- Charts: ALWAYS add \"autosize\":{\"type\":\"fit\",\"contains\":\"padding\"} (this kept Examples 2 & 3 geometry-clean). Use 5\u20137 data points per series. Compact points (\"point\":{\"size\":25}). Let Vega-Lite autosize the domain; do NOT set oversized scale domains.\n- Keep titles SHORT. Do NOT append annotations to chart titles (avoid \"Throughput \u2014 Peak 6,340 RPS @ 12:15\"). Put callouts elsewhere or omit.\n- Keep all text terse; long sub-text wraps and overflows vertically.\n\nVEGA-LITE GUIDANCE (vq scored ~0.88\u20131.00 \u2014 aim for 1.00)\n- Include axis titles EXACTLY as the Plan specifies for both x and y.\n- Multi-series lines: encode series via color with explicit {\"scale\":{\"domain\":[...],\"range\":[...]}} mapping Plan-named colors in Plan order. Use blue/green\u2192amber\u2192red for p50/p95/p99: [\"#1c7ed6\",\"#f59f00\",\"#e03131\"].\n- Threshold lines: {\"mark\":{\"type\":\"rule\",\"strokeDash\":[4,4],\"color\":...},\"encoding\":{\"y\":{\"datum\":<value>}}}. Use \"datum\" for single reference values.\n- Shaded bands: rect mark with y/y2 datums, low opacity (~0.12).\n- Stacked area: y encoding \"stack\":\"zero\". Overlay/comparison: \"stack\":null, opacity ~0.7.\n- Single top-level \"data\" block when layers share data; otherwise per-layer \"data\".\n- Use ordinal time fields (\"type\":\"ordinal\") so HH:MM ticks match the Plan.\n- Use realistic time-series values that hit/approach the Plan's stated peak/series numbers.\n\nMANTINE COMPONENT GUIDANCE (errors_panel/rubric)\n- Use CORRECT capitalized dotted compound tags: Table, Table.Thead, Table.Tbody, Table.Tr, Table.Th, Table.Td. 
NEVER lowercase HTML tags (thead/tbody/tr/th/td) \u2014 that scored errors_panel only 0.5.\n- RingProgress: {\"size\":100,\"thickness\":12,\"roundCaps\":true,\"sections\":[{\"value\":<num>,\"color\":\"green|yellow|red\"}],\"label\":{Text node, \"ta\":\"center\",\"fw\":700,\"size\":\"sm\"}}.\n- Badge: {\"color\":\"green|yellow|red\",\"variant\":\"filled\",\"size\":\"lg\"} with children \"\u25cf HEALTHY\" / \"\u25b2 AT RISK\" / \"\u25a0 BREACH\".\n- Alert banner (firing alert): {\"type\":\"Alert\",\"props\":{\"color\":\"red\",\"title\":\"\ud83d\udd34 FIRING: <plan alert>\",\"variant\":\"filled\"},\"children\":[\"<plan detail>\"]}. Use the Plan's exact alert text.\n- Layout: Group {\"justify\":\"space-between\",\"align\":\"center\"} for rows; Stack {\"gap\":\"xs\"} for vertical.\n- Watch spelling of props (e.g. \"c\":\"dimmed\", not \"dimped\").\n- Trend glyphs: \u25b2 up / \u25bc down / \u25ac (or \u2014) flat.\n\nGENERAL STRATEGY\n1. Parse the Plan tile-by-tile, in order. Build a checklist of every literal value.\n2. Map each Plan tile to a pane; component for SLO/errors, vegalite for charts.\n3. Transcribe EVERY title, value, label, color, threshold, axis title, table row, alert text VERBATIM from the Plan \u2014 no placeholders, no generic substitutes.\n4. Apply geometry constraints: ring size 100, autosize fit on charts, 5\u20137 points/series, only the named table rows, short titles, xs gaps.\n5. Re-verify each Plan bullet's literal value is present in the JSON (completeness pass).\n6. Validate JSON is well-formed. Output ONLY the JSON object." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_observability-dashboard/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_observability-dashboard/report.md new file mode 100644 index 0000000..22068b8 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_observability-dashboard/report.md @@ -0,0 +1,5 @@ +# GEPA journey: observability-dashboard + +rubric criteria: 7; val: 2 + +**Seed 0.3656 → Best 0.7856 (+0.4200)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_okr-tree/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_okr-tree/best_prompts.json new file mode 100644 index 0000000..d3312c1 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_okr-tree/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan an 'okr-tree' board as a STRUCTURED, MACHINE-PARSEABLE spec.\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is rendered by an automated tool and judged on comprehension, visual quality, geometry, and a strict rubric. Vague prose FAILS. You must emit explicit fields and explicit (x,y) coordinates for every node so nothing overlaps or falls off-canvas.\n\n=== HARD RUBRIC REQUIREMENTS (every one must be satisfied) ===\n1. THREE LEVELS: exactly one Objective at top, 2\u20133 Key Results below it, and Initiatives at the bottom (1\u20132 per Key Result).\n2. KIND TAGS: every node has a `kind:` field whose value is EXACTLY one of: objective | key result | initiative (lowercase, these literal strings).\n3. PROGRESS META: every objective node and every key-result node has a `meta:` field containing a percent figure like `Progress: 62%`. Initiatives do NOT need a percent.\n4. STATUS COLORS: every node has a `status:` field whose value is EXACTLY one of: on-track | at-risk | behind, and a `color:` field that maps it: on-track\u2192green, at-risk\u2192amber, behind\u2192red. Use these exact words.\n5. 
EDGE COLORS MATCH TARGET: every edge has a `color:` equal to the status-color of its TARGET (child) node. State this explicitly per edge.\n6. LEGEND: include a legend block with all three rows, exactly: green = on-track, amber = at-risk, red = behind.\n7. LAYOUT: top-to-bottom \u2014 objective row at top, key results in the middle, initiatives at the bottom.\n\n=== GEOMETRY (critical \u2014 avoids off-canvas & overlaps) ===\nAssume a canvas of width 1200 x height 800, origin (0,0) at top-left. Give EVERY node an (x,y) for its top-left corner and assume each node box is ~220 wide x ~90 tall.\n- Objective row: y = 40. Center it horizontally, e.g. x = 490.\n- Key Results row: y = 330. Spread evenly across width with margins; for 3 KRs use x = 120, 490, 860. For 2 KRs use x = 300, 680.\n- Initiatives row: y = 640. Spread across full width so none overlap; six ~220-wide boxes cannot fit side by side in 1200, so with 6 initiatives narrow each initiative box to ~170 wide and use x = 20, 212, 404, 596, 788, 980. Keep all node x between 20 and 980 and all y between 20 and 710 so nothing leaves the canvas.\n- Legend box: place at x = 990, y = 20 (top-right), small.\nVerify: no two boxes overlap (same-row x values differ by more than the box width); every x + box width \u2264 1200 and every y+90 \u2264 800.\n\n=== OUTPUT FORMAT (use exactly this structure) ===\nLEGEND:\n- green = on-track\n- amber = at-risk\n- red = behind\n\nNODES:\nO1 | kind: objective | label: \"<text>\" | meta: Progress: <N>% | status: <on-track|at-risk|behind> | color: <green|amber|red> | pos: (x, y)\nKR1 | kind: key result | label: \"<text>\" | meta: Progress: <N>% (<current>/<target>) | status: ... | color: ... | pos: (x, y)\nKR2 | ... \nKR3 | ...\nI1 | kind: initiative | label: \"<text>\" | status: ... | color: ... | pos: (x, y)\nI2 | ...\n(continue for all initiatives)\n\nEDGES:\nO1 -> KR1 | color: <matches KR1 status-color>\nO1 -> KR2 | color: <matches KR2 status-color>\n... (objective to every KR)\nKR1 -> I1 | color: <matches I1 status-color>\n... 
(each KR to its initiatives)\n\nREADER NOTE:\nOne or two sentences tying the colored branches to the stated Goal (e.g. which KRs/initiatives are behind/at-risk and what that means for the reader).\n\n=== CONTENT RULES ===\n- Use SPECIFIC, realistic labels and numbers from the Use case \u2014 real metric names, baselines, targets, and current values (e.g. \"Reduce MTTR to \u226430 min (from 95 min)\", current 72 min). Generic placeholders fail.\n- The objective's status/color should reflect the mix of its KRs (if any KR is behind/at-risk, the objective is usually at-risk/amber).\n- Make the status mix meaningful to the Goal: ensure at least one behind (red) and one at-risk (amber) branch so the reader can see what needs attention.\n- Double-check every edge color equals its target node's color, and every node's color matches its status per the legend mapping.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_okr-tree/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_okr-tree/report.md new file mode 100644 index 0000000..a9f2370 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_okr-tree/report.md @@ -0,0 +1,5 @@ +# GEPA journey: okr-tree + +rubric criteria: 7; val: 2 + +**Seed 0.4068 → Best 0.4567 (+0.0499)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_pr-review-summary/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_pr-review-summary/best_prompts.json new file mode 100644 index 0000000..90485c6 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_pr-review-summary/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'pr-review-summary' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are producing a CONCISE plain-text plan that names the SPECIFIC content (real values, labels, structure) for every region of the board. Generic plans fail. The plan is judged on comprehension, visual quality, geometry (content MUST fit without overflowing/clipping), and a strict rubric.\n\n== HARD RUBRIC REQUIREMENTS (every one must be satisfied explicitly) ==\n\n1. TITLE + GOAL: A title naming the PR (e.g. \"PR #128: <short change name>\") plus ONE single-line goal statement directly beneath it.\n\n2. BEFORE/AFTER TWO-COLUMN COMPARISON: Exactly two side-by-side columns.\n - LEFT = \"Before\" with a RED colored badge (e.g. \ud83d\udd34 \"Problem\") and a SHORT bulleted list (3\u20134 bullets max).\n - RIGHT = \"After\" with a GREEN colored badge (e.g. \ud83d\udfe2 \"Fix\") and a SHORT bulleted list (3\u20134 bullets max).\n - Each bullet must be terse (a few words), not a full sentence.\n\n3. TIMELINE: A numbered/stepped timeline (4\u20136 steps max) describing how the NEW flow runs AFTER the change. Each step = a short label, not a paragraph.\n\n4. 
GREEN VALIDATION ALERT: A clearly-distinct GREEN alert box element stating the validation/test result with concrete numbers (e.g. \"\u2705 All checks passed \u2014 142 tests green, p95 840ms\u219292ms\"). It must read as a standalone green alert, not a bullet inside another section. Keep it to one or two short lines.\n\n5. FAQ SECTION: A clearly-labeled \"FAQ\" section with 3\u20134 Q/A pairs. Format each EXPLICITLY as \"Q: ... / A: ...\" pairs as discrete rows/items so it renders as a real FAQ block, not prose. Keep each answer to one short line.\n\n6. OUTBOUND LINK \u2014 CRITICAL: Include an actual, complete clickable URL, not just the words \"links to the PR\". Write the literal link text AND a real href, e.g.:\n \"View PR #128 \u2192\" \u2192 https://github.com/<org>/<repo>/pull/128\n The plan MUST contain a concrete fully-formed URL string. A description without a URL scores zero.\n\n7. NARRATIVE COHERENCE: The board must read top-to-bottom as a story (why \u2192 what changed \u2192 how it works now \u2192 proof \u2192 doubts resolved \u2192 go review). Briefly note this flow.\n\n== GEOMETRY / CONCISENESS (critical \u2014 content has been clipping) ==\n- Keep ALL content tight. Prefer short labels and terse bullets over full sentences.\n- Cap lists: Before/After 3\u20134 bullets each; timeline 4\u20136 steps; FAQ 3\u20134 pairs.\n- Do NOT pad sections; over-long text causes overflow and lowers the score.\n- Omit any extra commentary sections beyond what the rubric requires.\n\n== OUTPUT FORMAT ==\nOutput a single concise plain-text plan with one labeled region per rubric item, in this top-to-bottom order:\n1. Title + one-line goal\n2. Before / After two-column comparison (with red and green badges)\n3. Timeline of the new flow\n4. Green validation alert (with real numbers)\n5. FAQ (explicit Q:/A: pairs)\n6. 
Outbound link with a real full URL\nFill in concrete, realistic values appropriate to the given use case, reader, and goal.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_pr-review-summary/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_pr-review-summary/report.md new file mode 100644 index 0000000..17852fa --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_pr-review-summary/report.md @@ -0,0 +1,5 @@ +# GEPA journey: pr-review-summary + +rubric criteria: 7; val: 2 + +**Seed 0.5800 → Best 0.6134 (+0.0333)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_pr-review/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_pr-review/best_prompts.json new file mode 100644 index 0000000..5bf2047 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_pr-review/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'pr-review' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Code-entity nodes are colored by diff status: added=green, removed=red(strikethrough), modified=amber, unchanged=neutral\n- Each node carries a kind eyebrow tag (module/function/class/file)\n- Each node shows its file path in a sub line\n- Changed nodes show a LOC delta (e.g. 
+42) in a meta line\n- Edges to removed entities are dashed/red, distinct from other edges\n- A legend maps the diff-status colors to added/removed/modified/unchanged\n- The change graph is laid out top-to-bottom\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_pr-review/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_pr-review/report.md new file mode 100644 index 0000000..23a3a82 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_pr-review/report.md @@ -0,0 +1,5 @@ +# GEPA journey: pr-review + +rubric criteria: 7; val: 2 + +**Seed 0.4987 → Best 0.4987 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_product-comparison/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_product-comparison/best_prompts.json new file mode 100644 index 0000000..e8b8a08 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_product-comparison/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'product-comparison' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Each option is its own card in a grid\n- Each option shows a prominent product image/logo (not a tiny letter)\n- Each option shows its price\n- Each option shows a star rating or score badge\n- Each option lists its key features\n- Each option has a clickable outbound 'Visit site \u2192' link\n- A 
standout/recommended option is flagged with a badge (e.g. 'Popular')\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_product-comparison/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_product-comparison/report.md new file mode 100644 index 0000000..37d5ac7 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_product-comparison/report.md @@ -0,0 +1,5 @@ +# GEPA journey: product-comparison + +rubric criteria: 7; val: 2 + +**Seed 0.8179 → Best 0.8179 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_query-plan/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_query-plan/best_prompts.json new file mode 100644 index 0000000..87175ed --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_query-plan/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'query-plan' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are producing a concise plain-text spec for a VISUAL board that will be rendered on a fixed canvas. Name the SPECIFIC content (real operator names, real row counts, real cost numbers, real predicates) \u2014 generic plans fail.\n\nThis is judged against a strict rubric. 
A great board MUST satisfy EVERY criterion:\n- The plan is a top-down tree of SQL operator nodes (scans, joins, aggregates)\n- Operators are colored by cost: cheap=green, moderate=amber, hot=red\n- The most expensive operator (hot scan/join) is flagged with an icon or red highlight\n- Each operator shows rows AND cost figures in a meta/sub line\n- Operators show the relevant predicate / join key / grouping detail\n- The operator tree reads top-to-bottom (root/final output at top, leaf scans at bottom)\n- A legend maps the cost colors to their meaning\n\nCRITICAL RULES (these caused low scores in past attempts \u2014 follow exactly):\n\n1. COST COLORING MUST BE INTERNALLY CONSISTENT AND CORRECT.\n - First, define the legend's numeric cost thresholds. Then assign EACH node a color by checking its cost number against those exact thresholds. Do not eyeball it.\n - Common failure: a node labeled AMBER/GREEN whose cost actually falls in the RED band, or vice versa. Verify every node: its color band MUST match where its cost figure sits in the legend ranges.\n - Pick thresholds that produce a real spread (some green, some amber, exactly one red hot node) given your actual cost numbers.\n\n2. EXACTLY ONE HOT NODE, AND IT MUST BE THE TRUE MAXIMUM.\n - The single most expensive operator (highest cost) is the RED hot node. It must have the largest cost figure of all nodes.\n - Flag it unambiguously: red color + a flame/warning icon + an explicit \"HOT\" / \"TARGET\" badge.\n - Do NOT mark two nodes as \"RED HOT\" / \"HOTTEST\"; if a second node is expensive, color it AMBER (or RED without the flame), but only ONE node gets the hot flag. Ensure the flagged node's cost is strictly the highest.\n\n3. KEEP IT ON A SIMPLE, NON-OVERLAPPING, TOP-TO-BOTTOM LAYOUT.\n - Geometry is scored: past boards lost points for overlapping boxes and nodes going off-canvas. Prefer a SINGLE vertical (linear) chain of nodes connected by downward arrows. 
Avoid wide side-by-side branches and avoid sprawling ASCII art that overflows.\n - Keep each node box narrow and short. Do not place callouts/legends in \"side\" or \"corner\" positions that drift off-canvas \u2014 list them as plain stacked sections below the tree.\n - Limit to roughly 4\u20136 nodes. Fewer, well-placed nodes beat a crowded tree.\n\nCONTENT EACH NODE NEEDS (one compact block per node, stacked top\u2192bottom):\n- Operator name + table (e.g., \"Seq Scan: orders\", \"Hash Join\", \"HashAggregate\")\n- Color tag matching the legend band (GREEN / AMBER / RED)\n- Meta line with BOTH rows= and cost= (optional: actual time)\n- Detail line: the predicate / join key / grouping / sort key (e.g., \"Filter: name LIKE '%...%'\", \"ON o.customer_id = c.customer_id\", \"GROUP BY date_trunc('day', event_ts)\")\n\nDOMAIN FACTS to use for realism:\n- Plan root (top) is usually LIMIT / Sort / Aggregate; leaves (bottom) are scans (Seq Scan, Index Scan, Bitmap Index Scan).\n- A leading-wildcard LIKE '%...%' cannot use a B-tree index \u2192 causes a Seq Scan; the fix is a GIN trigram index (gin_trgm_ops) or full-text index.\n- A filter on an unindexed date column \u2192 Seq Scan; fix is a B-tree index on that column.\n- A costly GROUP BY \u2192 HashAggregate; if it spills to disk it's a materialized-view candidate.\n- Express the hot node's share (e.g., \"84% of total runtime / cost\") to justify it as the optimization target.\n\nOUTPUT FORMAT:\n1. Short board title referencing the specific query/use case.\n2. The SQL query or EXPLAIN being analyzed (1\u20134 lines), if helpful.\n3. The vertical operator tree: each node as a compact stacked block with color tag, rows+cost meta, and detail line, connected by downward arrows, reading top\u2192bottom.\n4. Legend: green/amber/red mapped to EXPLICIT numeric cost ranges, plus the flame-icon meaning.\n5. 
A short hot-node / decision callout (stacked below, not in a corner) tying the flagged operator to the Goal.\n\nBefore finalizing, self-check:\n- Does exactly one node carry the hot flag, and is its cost the strict maximum? \n- Does every node's color band match its cost against the legend ranges?\n- Does every node have rows, cost, AND a predicate/key/grouping detail?\n- Is the layout a clean single vertical chain with no overlap and nothing off to the side?", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_query-plan/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_query-plan/report.md new file mode 100644 index 0000000..e3baf87 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_query-plan/report.md @@ -0,0 +1,5 @@ +# GEPA journey: query-plan + +rubric criteria: 7; val: 2 + +**Seed 0.5126 → Best 0.5613 (+0.0487)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_raci-matrix/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_raci-matrix/best_prompts.json new file mode 100644 index 0000000..fb76cac --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_raci-matrix/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'raci-matrix' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nProduce a concise plain-text plan with SPECIFIC, real content: concrete deliverable labels, real role names from the use case, and an actual fully-filled matrix. Generic plans fail.\n\n=== TOP PRIORITY: COMPREHENSION ===\nThe score is dominated by how instantly understandable the final matrix is. 
To maximize it:\n- Output ONE single, final, correct matrix. NEVER show draft tables, \"corrected\" versions, \"wait/see note\", \"if you want both...\", or any iterative reasoning. Decide assignments internally; output only the clean final grid.\n- Do NOT add hedging or explanatory caveats below the table (e.g. \"R appears via the A-owner's own work\", \"to keep one clear owner...\"). These badly hurt comprehension. The only allowed text after the table is a SINGLE one-line row-check confirmation.\n- Make every assignment intuitive and defensible: the role owning the outcome = A; the role doing hands-on work = R; advisors = C; bystanders = I. A reader should instantly understand ownership without explanation.\n- Ensure each row visibly contains BOTH a distinct A cell AND at least one distinct R cell in DIFFERENT roles where realistic. Do not collapse A and R into one role for every row \u2014 separating \"who approves\" (A) from \"who executes\" (R) is the entire point and aids comprehension. Only put A and R on the same role if genuinely unavoidable, and even then use a single letter (A) in that cell, never a combined badge.\n\n=== HARD RULES (rubric) ===\n1. Rows = deliverables/tasks; columns = roles. Leftmost column = deliverable labels.\n2. Each filled cell holds EXACTLY ONE R/A/C/I letter. NEVER use combined badges like \"R/A\", \"A\u00b7R\", \"A/R\" \u2014 these break the rubric AND cause table overflow/clipping. One letter per cell, always.\n3. Each row has EXACTLY ONE Accountable (A) cell, and at least one Responsible (R) cell. One role may be A on multiple rows (one A per ROW, not per column). Verify internally; state a single one-line confirmation after the table (not a redraft), e.g. \"Row check \u2014 exactly one A per row: Spec\u2192PM, Build\u2192Eng, ... \u2713\".\n4. Do NOT leave the grid sparse. Fill MOST cells with meaningful R/A/C/I. Use I generously for low-involvement roles rather than blanks. 
Leave a cell blank only if a role is genuinely unrelated to that task.\n5. Include the legend defining ALL FOUR letters, each shown in its matching badge style:\n - R = Responsible \u2014 does the hands-on work.\n - A = Accountable \u2014 owns the outcome / final approver (one per row).\n - C = Consulted \u2014 gives input before/during.\n - I = Informed \u2014 kept up to date on progress.\n\n=== BADGE STYLING (exact emphasis hierarchy) ===\n- A (Accountable) \u2014 solid dark-navy filled circle (#1A2B4A), bold white letter, slightly larger (~10%) \u2014 MOST prominent.\n- R (Responsible) \u2014 solid green filled circle (#2E8B57), white letter.\n- C (Consulted) \u2014 white circle with colored outline ring (#E08A1E), matching amber/orange letter \u2014 outline style.\n- I (Informed) \u2014 pale gray circle (#E5E7EB), light-gray letter (#9CA3AF) \u2014 LEAST prominent.\n- All badges: single uppercase letter, centered horizontally and vertically, equal-feeling size (A marginally larger).\n\n=== STRUCTURE / GEOMETRY ===\n- Rows: 4\u20136 concrete, specific deliverables, each with a brief parenthetical detail (e.g. \"Creative Assets (banners, social graphics, email headers)\"). Keep parentheticals SHORT to avoid overflow.\n- Columns: the specific teams/roles named in the use case, plus the leftmost label column. Keep role headers short (use abbreviations like PM, QA, Eng if natural).\n- Render the matrix as a SINGLE markdown table with centered cell alignment (:---: ) for badge columns and left-align (:--- ) for the deliverable column.\n- Keep cell contents to one character so nothing clips.\n\n=== OUTPUT SECTIONS (in this exact order) ===\n1. Title \u2014 clear, specific, names the use case; optional one-line subtitle tying to the goal.\n2. Structure \u2014 one or two lines describing rows = deliverables, columns = roles.\n3. Columns (roles) \u2014 bullet list, each role with a brief description.\n4. 
Rows (deliverables) \u2014 bullet list of the 4\u20136 deliverables.\n5. Matrix \u2014 the single final markdown table, followed by ONE one-line row-check confirmation only.\n6. Badge Styling \u2014 the four badge descriptions above.\n7. Legend \u2014 single horizontal strip at the bottom, each letter inline in its matching badge style with its definition.\n8. Layout Notes \u2014 brief: bold shaded header row of roles; left-aligned bold deliverable labels in first column; centered badge cells; thin gridlines / alternating row tint; legend as a horizontal strip at bottom.\n\nKeep everything concise and concrete. No iterative reasoning, no hedging, no combined badges \u2014 one clean, intuitive, fully-filled matrix.", + "generate": "You are generating a single valid JSON UI component tree (Mantine-style component schema) that renders a RACI matrix board. Output ONLY the JSON object \u2014 no prose, no markdown, no placeholders. Bake in the SPECIFIC tasks, roles, and assignments named in the plan.\n\n{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\n## What a RACI matrix is\nA grid of deliverables/tasks (rows) \u00d7 roles (columns). Each intersection cell holds exactly ONE letter badge: R, A, C, or I. Empty cells (a role with no involvement) show \"\u2014\".\n- R = Responsible \u2014 does the work / executes the task\n- A = Accountable \u2014 owns the outcome; the single final approver. EXACTLY ONE A per row.\n- C = Consulted \u2014 gives input before decisions (two-way communication)\n- I = Informed \u2014 kept updated on progress (one-way communication)\n\nBefore finalizing, audit every row: confirm precisely one A appears. Take the assignments from the plan, but if the plan contains an inconsistency, correct it so each row has exactly one A.\n\n## CRITICAL: how to build the matrix (this drives the rubric)\nDO NOT use a `Table` component with raw text letters in cells. 
A Table cannot satisfy the badge, styling, and centering requirements and scores poorly.\n\nINSTEAD, build the grid using a `Grid` component where every single cell is a `Grid.Col` containing a `Center` that wraps a styled `Badge` (for letter cells) or a `Text` (for header/label cells).\n\nStructure for the matrix Grid:\n- Set `columns` to a total that divides evenly. Use a wide span for the leftmost task/label column (e.g. span 3) and equal narrow spans for each role column (e.g. span 1.5 each). Pick totals so all columns sum exactly to the `columns` value. Example: 4 roles \u2192 columns: 9, label span 3, four role cols span 1.5 each.\n- Set Grid props: `{\"gutter\": \"xs\", \"align\": \"center\"}`.\n- First row: a label `Grid.Col` (bold Text, e.g. \"Task / Phase\") followed by one `Grid.Col` per role, each holding `{\"type\":\"Center\",\"children\":{\"type\":\"Text\",\"props\":{\"fw\":700,\"size\":\"sm\",\"ta\":\"center\"},\"children\":\"<RoleName>\"}}`.\n- After the header row and between every data row, insert a full-width divider: `{\"type\":\"Grid.Col\",\"props\":{\"span\":<columns>},\"children\":{\"type\":\"Divider\",\"props\":{}}}`.\n- Each data row: a label `Grid.Col` (bold Text, the task name) followed by one badge cell per role.\n\n## Badge cells \u2014 distinct, visually differentiated styling (REQUIRED)\nEach letter must have a clearly distinct visual treatment. 
Use these exact styles consistently in BOTH the matrix and the legend:\n- A (Accountable): `{\"color\":\"indigo\",\"variant\":\"filled\",\"size\":\"lg\",\"radius\":\"sm\"}` \u2014 most prominent, largest, darkest.\n- R (Responsible): `{\"color\":\"green\",\"variant\":\"filled\",\"size\":\"md\",\"radius\":\"sm\"}` \u2014 solid, medium.\n- C (Consulted): `{\"color\":\"yellow\",\"variant\":\"outline\",\"size\":\"md\",\"radius\":\"sm\"}` \u2014 outline only, transparent fill.\n- I (Informed): `{\"color\":\"gray\",\"variant\":\"light\",\"size\":\"md\",\"radius\":\"sm\"}` \u2014 muted, low-contrast, visually quietest.\n- Empty cell (\u2014): `{\"type\":\"Center\",\"children\":{\"type\":\"Text\",\"props\":{\"c\":\"dimmed\",\"size\":\"sm\"},\"children\":\"\u2014\"}}`.\n\nEvery badge cell is: `{\"type\":\"Grid.Col\",\"props\":{\"span\":<roleSpan>},\"children\":{\"type\":\"Center\",\"children\":{\"type\":\"Badge\",\"props\":{...style...},\"children\":\"<letter>\"}}}`. The `Center` guarantees horizontal+vertical centering within the cell.\n\n## Overall document structure\n1. Outer `Stack` with `{\"gap\":\"lg\",\"p\":\"md\"}`.\n2. `Title` (order 1) with the exact title from the plan.\n3. Optional short `Text` (`\"c\":\"dimmed\"`) describing how to read it.\n4. A `Card` (`withBorder: true, padding: \"lg\", radius: \"md\"`) containing the matrix `Grid` described above.\n5. A `Card` (legend) containing a `Title` (order 4, \"Legend\") and a `List` (`listStyleType: \"none\"`, spacing \"sm\"). Each `List.Item` is a `Group` (`\"gap\":\"xs\"`) pairing the correctly-styled Badge with a `Text` definition. Use all four definitions verbatim from the \"What a RACI matrix is\" section above, noting A is \"exactly one per task\".\n\n## Geometry / overflow guard\nKeep content from overflowing or clipping: keep the title concise, use `size:\"sm\"` text in cells, keep role column counts modest, and never let column spans exceed the Grid `columns` total. 
Avoid extremely long titles that wrap and push layout.\n\nProduce the single valid JSON object now. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_raci-matrix/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_raci-matrix/report.md new file mode 100644 index 0000000..1ae2166 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_raci-matrix/report.md @@ -0,0 +1,5 @@ +# GEPA journey: raci-matrix + +rubric criteria: 7; val: 2 + +**Seed 0.7144 → Best 0.8222 (+0.1079)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_recipe-display/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_recipe-display/best_prompts.json new file mode 100644 index 0000000..0e09186 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_recipe-display/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are planning a 'recipe-display' board that will be rendered as a single visual card and scored by a strict automated rubric. Your output is a PLAN (plain text), structured so the rubric parser can unambiguously detect every required element.\n\nINPUT FORMAT\n- Use case: {topic} (the dish and what the card should contain)\n- Reader: {audience}\n- Goal: {purpose}\n\nWHAT TO PRODUCE\nA concise, concrete plan with REAL values (actual ingredient quantities, actual numbered steps, actual time/serving/difficulty values, actual nutrition numbers). Generic placeholders fail. Keep it as SHORT as possible \u2014 overflow has been the #1 failure, so cut aggressively.\n\n==================================================================\nCRITICAL FIXES (past plans repeatedly lost points here \u2014 apply ALL):\n\nA. NUTRITION & TIPS WAS SCORED 0.0 EVERY TIME. The parser failed to detect them even though they were present. 
To fix this, give NUTRITION and TIPS their OWN separate, literal, top-level headers (do NOT combine them under one \"NUTRITION & TIPS\" header). Use these EXACT headers on their own lines:\n NUTRITION:\n TIPS:\n Under NUTRITION put exactly ONE line beginning with the label \"NUTRITION:\" followed by the calorie (kcal) value so it is unambiguous, e.g.:\n NUTRITION: 620 kcal, 28g protein, 65g carbs, 26g fat\n Format it as: NUTRITION: <kcal> kcal, <protein>g protein, <carbs>g carbs, <fat>g fat\n Under TIPS list 2 short bullet tips, each starting with \"- \".\n Both the word \"NUTRITION\" and the word \"TIPS\" must literally appear as labels.\n\nB. CONTENT OVERFLOWED/CLIPPED EVERY TIME. The cards were still too long. Cut harder than feels comfortable:\n - Ingredients: 6 lines MAX (not 8). Each line one short item.\n - Steps: 5 steps MAX (not 7). Each step ONE short sentence, no clauses stacked with commas, no parentheticals.\n - Tips: exactly 2 short bullets.\n - Nutrition: exactly ONE line.\n - Hero image description: one short line only.\n - No subtitle, no intro, no footer, no commentary, no \"visual flow\" notes.\n - Hero image height ~300px (card-appropriate, not oversized).\n==================================================================\n\nREQUIRED OUTPUT FORMAT (use these literal headers, exactly, each on its own line):\n\nLAYOUT: Single vertical column, top-down order: HERO IMAGE \u2192 TITLE \u2192 BADGES \u2192 INGREDIENTS \u2192 STEPS \u2192 NUTRITION \u2192 TIPS.\n\nHERO IMAGE: <one short line describing the plated dish>, rendered full-width and prominent (~300px tall, not a thumbnail).\n\nTITLE: <dish name>\n\nBADGES:\n- Prep Time: <value>\n- Cook Time: <value>\n- Servings: <value>\n- Difficulty: <value>\n\nINGREDIENTS:\n- <quantity + ingredient>\n- <quantity + ingredient>\n(6 lines MAX; EVERY line must start with a numeric quantity or measure)\n\nSTEPS:\n1. <step>\n2. <step>\n3. <step>\n(5 steps MAX; use a REAL ordered numbered list 1., 2., 3., 4., 5. 
\u2014 mandatory; never bullets, dashes, or prose)\n\nNUTRITION: <kcal> kcal, <protein>g protein, <carbs>g carbs, <fat>g fat\n\nTIPS:\n- <short tip>\n- <short tip>\n\nRULES\n- Plain text only. Use the literal headers above (LAYOUT:, HERO IMAGE:, TITLE:, BADGES:, INGREDIENTS:, STEPS:, NUTRITION:, TIPS:).\n- Begin with the LAYOUT line naming the single-column top-down order.\n- Single cohesive top-down card; never propose two-column or side-by-side body layouts.\n- Use real, concrete values throughout; no filler, no placeholders.\n- Keep the whole plan tight enough to fit one card without clipping \u2014 when in doubt, cut.\n\nREQUIRED CRITERIA CHECKLIST (the plan MUST satisfy every one):\n- Recipe title present AND a prominent (non-thumbnail) hero image of the dish.\n- Four badges: prep time, cook time, servings, difficulty \u2014 each with a real value.\n- Ingredients list (\u22646) where EVERY item starts with a real quantity.\n- Cooking steps (\u22645) as a genuine numbered ordered list (1., 2., 3. ...).\n- A NUTRITION line (clearly labeled \"NUTRITION:\") AND a TIPS section (clearly labeled \"TIPS:\"), both present and separately labeled.\n- Single cohesive top-down card in order: image, title, badges, ingredients, steps, nutrition, tips.", + "generate": "You are generating a single valid JSON object describing a Mantine-style component tree that renders a RECIPE CARD. Output ONLY the JSON object \u2014 no markdown, no commentary, no code fences.\n\n## INPUT\nYou receive a plan containing these labeled sections: LAYOUT, TITLE, HERO IMAGE, BADGES, INGREDIENTS, STEPS, NUTRITION & TIPS. Bake the SPECIFIC values from the plan directly into the JSON \u2014 never use placeholders.\n\n## TASK\nRender the recipe as a single vertical column, top-down, in EXACTLY this order:\nHERO IMAGE \u2192 TITLE \u2192 BADGES \u2192 INGREDIENTS \u2192 STEPS \u2192 NUTRITION & TIPS.\nThe root must be a `Stack` with `props.gap: \"md\"`. 
Each section is a direct child of that Stack, in the order above. Do not reorder, merge, or omit sections.\n\n## CRITICAL RUBRIC REQUIREMENTS (these have failed repeatedly \u2014 follow exactly)\n\n### 1. steps_numbered \u2014 MUST show visible numbers (1., 2., 3., ...)\nDo NOT rely on `List` with `type: \"ordered\"` \u2014 it has not rendered numbers reliably and consistently scores 0. Instead, PREFIX each step's text with its explicit number, e.g. `\"1. Boil spaghetti...\"`, `\"2. Whisk eggs...\"`. You may keep the List, but the numbers MUST appear literally in the text content of each step item. Every step from the plan must be present and numbered in order.\n\n### 2. nutrition_or_tips \u2014 MUST clearly present BOTH the nutrition line AND the tips\nThis has scored 0 repeatedly. To satisfy it:\n- Put the nutrition info on ONE clearly labeled line containing ALL values together, e.g. a `Text` with children `\"Nutrition (per serving): 620 kcal \u00b7 28g protein \u00b7 65g carbs \u00b7 26g fat\"`. Do NOT split nutrition into a 4-column SimpleGrid of separate cards \u2014 that fragments the data and fails the check.\n- Render EACH tip from the plan as its own element with the literal label \"Tip:\" included in the text (e.g. a `List.Item` or `Text` reading `\"Tip: Remove pan from heat before adding eggs to avoid scrambling.\"`).\n- Wrap the nutrition line and tips together inside a single `Card` with `props.withBorder: true`, headed by a `Title` (`order: 2`) with children \"Nutrition & Tips\".\n\n### 3. image_present \u2014 hero image must be present and reliable\n- Use an `Image` element as the FIRST child.\n- Use a stable, well-known image URL. Prefer a generic, reliable source; do not append fragile query params. Keep `props.h` to about 200 (NOT 320 \u2014 taller images cause overflow/clipping and hurt composition).\n- Always include a descriptive `alt` matching the HERO IMAGE description from the plan.\n\n### 4. 
recipe_title \u2014 title must be the recipe name, prominent\n- A single `Title` with `props.order: 1` and children set to the EXACT TITLE from the plan, placed directly after the image.\n\n### 5. top_down_layout \u2014 keep strict vertical single-column order (see above).\n\n## AVOIDING OVERFLOW / CLIPPING (composition scores are low \u2014 fix this)\n- Keep the layout compact and narrow. Use a single vertical column only.\n- Do NOT use wide multi-column SimpleGrids (e.g. cols: 4); they overflow horizontally.\n- Keep the hero image height moderate (~200px).\n- Use modest spacing/padding; prefer `gap: \"sm\"` or `\"md\"`.\n- Keep total vertical content reasonable; avoid redundant nested cards.\n\n## SECTION CONSTRUCTION\n- BADGES: a `Group` (`props.gap: \"xs\"`) of `Badge` elements, one per badge in the plan, with each badge's full label as children (e.g. \"Prep Time: 10 min\"). Use varied `color` values.\n- INGREDIENTS: a `Card` (`withBorder: true`) containing a `Title` (order 2) \"Ingredients\" and a `List` (`spacing: \"xs\"`) with one `List.Item` per ingredient, exact text from plan.\n- STEPS: a `Card` (`withBorder: true`) containing a `Title` (order 2) \"Steps\" and a `List` (`spacing: \"sm\"`) with one `List.Item` per step, EACH PREFIXED with its number (see requirement 1).\n- NUTRITION & TIPS: as described in requirement 2.\n\n## OUTPUT\nA single valid JSON object only. Verify before finishing:\n- Order is image \u2192 title \u2192 badges \u2192 ingredients \u2192 steps \u2192 nutrition&tips.\n- Every step text begins with \"N. \".\n- The nutrition line lists all values together on one line, and every tip is labeled \"Tip:\".\n- Image height ~200, no 4-column grids, single column throughout." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_recipe-display/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_recipe-display/report.md new file mode 100644 index 0000000..269c03f --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_recipe-display/report.md @@ -0,0 +1,5 @@ +# GEPA journey: recipe-display + +rubric criteria: 7; val: 2 + +**Seed 0.4915 → Best 0.5697 (+0.0781)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_recursion-tree/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_recursion-tree/best_prompts.json new file mode 100644 index 0000000..499bd58 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_recursion-tree/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'recursion-tree' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou output a concrete, plain-text plan that a layout engine will render onto a FIXED-SIZE canvas. The plan is scored on comprehension, visual quality, geometry (nothing off-canvas, no overlaps), and a strict rubric. You MUST satisfy EVERY rubric criterion below with SPECIFIC, NAMED content (real call signatures and real return values) \u2014 generic descriptions fail.\n\n=== RUBRIC CRITERIA (each must be concretely satisfied) ===\n\n1. TOP-DOWN TREE: Recursive calls form a tree, root call at top, growing downward. Each call is one node.\n\n2. BASE-CASE LEAVES MUST BE GREEN \u2014 THIS IS THE MOST-FAILED CRITERION.\n - Do NOT just mention green in prose. 
Produce an explicit, exhaustive, itemized list titled \"BASE CASE NODES (GREEN):\" that names EACH leaf node by its exact call signature and return value, e.g.\n BASE CASE NODES (GREEN):\n - factorial(1) \u2192 returns 1\n - fib(1) \u2192 returns 1\n - fib(0) \u2192 returns 0\n - Every leaf in your tree must appear in this list and be explicitly assigned color = green.\n - State plainly: \"All nodes in the above list are colored GREEN (#2e7d32 / green fill).\"\n\n3. EACH CALL NODE SHOWS ITS RETURN VALUE on a second sub-line (line 1 = the call, line 2 = `\u2192 returns X`). Show real values, including the arithmetic where helpful (e.g. `\u2192 returns 4 \u00d7 6 = 24`).\n\n4. MEMOIZED / CACHE-HIT NODES MUST BE VISUALLY DISTINCT (blue/info fill) WITH A 'cached' EDGE LABEL.\n - If the algorithm HAS overlapping subproblems (e.g. naive Fibonacci), you MUST identify at least one specific repeated call, render it as a BLUE node, and give the edge into it the label `cached`. Produce an explicit list titled \"MEMOIZED NODES (BLUE):\" naming each cached node by signature, the value it returns from cache, and the parent edge that carries the `cached` label.\n - If the algorithm has NO overlapping subproblems (e.g. factorial, merge sort), state explicitly \"MEMOIZED NODES (BLUE): none \u2014 this algorithm has no repeated subproblems\" AND still define blue in the legend. Do not invent fake cache hits.\n\n5. READS TOP-TO-BOTTOM from the root call (root at top, leaves at bottom).\n\n6. NO OVERLAPS, NO EDGES THROUGH UNRELATED NODES \u2014 and CRITICALLY, NOTHING OFF-CANVAS (this caused 6\u201314 off-canvas nodes in failing examples).\n\n7. LEGEND PRESENT \u2014 THIS IS A FREQUENTLY-FAILED CRITERION.\n - Produce an explicit section titled exactly \"LEGEND:\" mapping THREE colors to roles, in plain text WITHOUT relying on emoji (emoji-only legends scored 0). 
Use the literal words \"Green\", \"Blue\", and the internal-call color word, e.g.:\n LEGEND:\n - Green = Base case (returns directly, no further recursion)\n - Gray = Internal call (recurses into subcalls)\n - Blue = Memoized / cache-hit (value reused; edge labeled 'cached')\n\n=== GEOMETRY / SIZING RULES (to avoid off-canvas and overlap) ===\n- Assume a bounded canvas. Keep the FULL tree inside it. The biggest cause of failure is wide/deep trees overflowing.\n- Constrain breadth: if the natural tree has more than ~8 nodes on its widest level OR more than ~15 total nodes, DO NOT draw every node full-size at the bottom. Instead, reduce the example size and SAY SO (e.g. prefer fib(5) over fib(6); prefer a 4-element array over 8), OR collapse deep/repeated subtrees into a single summarized/cached node so the drawn tree stays small.\n- Give explicit layout rules: state the number of levels, that the root is horizontally centered, that each parent is centered above its children, that sibling spacing is wide enough that no boxes touch, and that leaves sit on the bottom row inside the canvas margins.\n- Edges connect only direct parent\u2192child as straight/elbow lines; left subtree stays fully left, right subtree fully right, so no edge crosses an unrelated node.\n\n=== DOMAIN FACTS TO GET RIGHT ===\n- factorial(n): linear recursion, single chain, no repeated subproblems \u2192 no cache nodes. Base case factorial(1)=1 (or factorial(0)=1). Returns n \u00d7 factorial(n-1).\n- merge sort: binary divide-and-conquer; base case = single-element subarray returns itself; NO overlapping subproblems \u2192 no cache nodes. Each internal node's return value is the SORTED merged subarray. Keep arrays small (4 elements \u21d2 7 nodes fits; 8 elements \u21d2 15 nodes risks overflow, so prefer 4 or collapse).\n- naive Fibonacci: each fib(n) = fib(n-1) + fib(n-2); base cases fib(0)=0, fib(1)=1 (GREEN). 
HAS overlapping subproblems \u2192 you MUST mark at least one repeated call BLUE with a 'cached' edge. Left child = fib(n-1), right child = fib(n-2). Keep n small (n=5 max) to fit canvas.\n\n=== OUTPUT FORMAT ===\nConcise plain-text plan with these explicit sections, in order:\n- Title\n- Tree structure (an ASCII sketch AND/OR per-level list showing each node's call + `\u2192 returns X`)\n- BASE CASE NODES (GREEN): exhaustive itemized list\n- INTERNAL CALL NODES: itemized list (with return values)\n- MEMOIZED NODES (BLUE): itemized list, or \"none \u2014 no repeated subproblems\"\n- Edges: parent\u2192child connections, branch labels, and any 'cached' edge labels\n- Layout / geometry rules: levels, centering, spacing, on-canvas guarantee\n- LEGEND: three plain-text color\u2192role mappings (no emoji-only)\nBe concrete throughout: real signatures, real numbers, real array contents.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_recursion-tree/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_recursion-tree/report.md new file mode 100644 index 0000000..05875b5 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_recursion-tree/report.md @@ -0,0 +1,5 @@ +# GEPA journey: recursion-tree + +rubric criteria: 7; val: 2 + +**Seed 0.5360 → Best 0.6896 (+0.1536)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_risk-matrix/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_risk-matrix/best_prompts.json new file mode 100644 index 0000000..931498c --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_risk-matrix/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'risk-matrix' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- A grid with Impact on one axis and Likelihood on the other\n- Each cell is tinted from green (low) through amber to red (high) by likelihood \u00d7 impact\n- Individual risks appear as small labeled chips/badges placed in the appropriate cell\n- Both axes are labeled (Impact High\u2192Low, Likelihood Low\u2192High)\n- A swatch legend explains the low/medium/high severity colors\n- The severity gradient reads along the diagonal (low corner to high corner)\n- Cells show or imply the numeric score so color isn't the only channel\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nYou are generating a single valid JSON object describing a UI layout (Mantine-style component tree: nodes have \"type\", \"props\", and \"children\"). 
The current use case is a RISK MATRIX BOARD. Produce JSON that bakes in the SPECIFIC values the plan names (axis labels, risk chips, scores, colors, legend text, callouts) \u2014 never use placeholders. Output ONLY the JSON.\n\n====================================================================\nTOP PRIORITY \u2014 STOP THE OVERFLOW/CLIPPING (this is the #1 scored failure)\n====================================================================\nPast outputs scored a PERFECT rubric (1.00) but FAILED composition (0.29\u20130.43) every time with \"content overflows/clips\". The rubric content is already correct \u2014 DO NOT add more. Your ONLY job beyond correctness is to make the WHOLE layout fit without overflowing. Treat total content footprint as a hard budget. Apply ALL of the following:\n\n- Wrap everything in ONE Stack with \"gap\": \"sm\" (never \"lg\"; \"md\" only if content is unusually small). Do NOT add page padding (\"p\") that eats width.\n- The matrix Table is the widest element. Keep header labels SHORT \u2014 use the tick number with a 1\u20133 letter abbreviation, e.g. \"Rare (1)\" \u2192 \"R1\", \"Unlikely (2)\" \u2192 \"U2\", \"Likely (4)\" \u2192 \"L4\", \"Almost Certain (5)\" \u2192 \"AC5\". Put the full tick names only in a one-line caption under the table.\n- Inside table cells: keep cell text MINIMAL. Use only the score + band emoji for empty cells (e.g. \"12 \ud83d\udfe8\"). For cells containing a risk, use the score + emoji + a SHORT 2-letter risk CODE only (e.g. \"15 \ud83d\udfe5 DC\"). NEVER put full risk names inside table cells \u2014 full names live only in the summary cards.\n- Set table font small: add \"fz\": \"xs\" (or \"fontSize\":\"xs\") to the Table props so all 6 columns fit horizontally.\n- The risk summary Cards are the second-biggest overflow source. Use SimpleGrid with \"cols\": { \"base\": 1, \"sm\": 2 } and \"spacing\": \"xs\". Card \"padding\": \"xs\". Inside each card stack vertically (NO wide Groups). 
Keep each card to: risk name (fw 700, size \"sm\"), one line \"score + Badge\", one dimmed \"xs\" breakdown line, and at most ONE short \"xs\" mitigation line. Do not exceed 4 short text nodes per card.\n- The score number inside cards may be \"lg\" at most (not \"xl\"); secondary text is \"xs\".\n- Legend, gradient caption, and decision callout must all use \"xs\"/\"sm\" text.\n- Avoid redundant wrapper Cards around the table when they add width; a plain Table + caption is fine.\n- Prefer vertical stacking everywhere over horizontal Groups. Never place a horizontal row that could exceed container width.\n- Keep total node count lean \u2014 every element you add risks pushing content off-canvas. Include exactly what the rubric needs, nothing decorative.\n\n====================================================================\nDOMAIN KNOWLEDGE \u2014 RISK MATRIX BOARDS (apply even if the plan is terse)\n====================================================================\n\nGRID\n- Build the matrix as a Mantine Table with \"withTableBorder\": true and \"withColumnBorders\": true. Default size is 5\u00d75, but FOLLOW THE PLAN \u2014 if the plan specifies 4\u00d74 (or another size), build that exact grid.\n- Columns = Likelihood (low\u2192high, left to right). Rows = Impact (high\u2192low, top to bottom) so the HIGH-severity corner is TOP-RIGHT and the LOW-severity corner is BOTTOM-LEFT.\n- The header row's first cell labels both axes, e.g. \"Impact \u2193 / Likelihood \u2192\".\n- EVERY body cell must print its numeric score = Likelihood \u00d7 Impact. Color must never be the only signal \u2014 the printed number is mandatory in every cell.\n\nSCORE BANDS / COLORS (use the plan's EXACT band cutoffs; otherwise these defaults)\n- Green (Low): 1\u20136\n- Amber/Orange (Medium): 7\u201314\n- Red (High): 15\u201325\nIf the plan gives different cutoffs (e.g. a 4\u00d74 with Green 1\u20133, Amber 4\u20138, Red 9\u201316), use the plan's cutoffs exactly. 
Map bands to Mantine color names: green, yellow/orange, red.\n\nHEAT TINTING (rubric-scored \u2014 0.0 when missing)\n- EVERY cell must carry a band-colored emoji square matching its score band: \ud83d\udfe9 green band, \ud83d\udfe8/\ud83d\udfe7 amber band, \ud83d\udfe5 red band. Example: \"12 \ud83d\udfe8\" or \"20 \ud83d\udfe5\".\n- Tint ALL cells including low/green ones \u2014 never leave any cell un-tinted.\n\nDIAGONAL GRADIENT (rubric-scored \u2014 0.0 when missing)\n- Tint must intensify diagonally from bottom-left (lowest score, deepest green) through the mid-diagonal (amber) to top-right (highest score, deepest red).\n- Add ONE short caption beneath the table stating the direction, e.g. \"Severity intensifies diagonally: bottom-left (1) green \u2192 mid (9) amber \u2192 top-right (25) red.\"\n\nRISK CHIPS (rubric-scored)\n- Place EACH named risk into its CORRECT cell, computed as Likelihood \u00d7 Impact. Verify row (Impact) and column (Likelihood) before writing. Double-check the arithmetic.\n- In-cell chip: score + band emoji + SHORT CODE only (e.g. \"15 \ud83d\udfe5 DC\").\n- Also render every chip as a compact summary Card (risk name, score, band Badge, \"Likelihood \u00d7 Impact\" breakdown, short mitigation). Order cards by score DESCENDING.\n\nLEGEND (rubric-scored \u2014 keep complete)\n- Titled Legend with all THREE bands: color swatch emoji, band name, numeric range, AND the action text from the plan (e.g. \"\ud83d\udfe9 Low (1\u20136): Monitor\"). Use a List, one band per line, \"size\":\"sm\".\n\nDECISION CUE / TAKEAWAY (supports the goal)\n- Include the plan's titled callout (Alert) and/or an ordered mitigation list ranked by score descending, with the exact risk names, scores, and mitigation actions from the plan. 
Keep it to ONE of these (Alert OR ordered list) if both would push content off-canvas \u2014 prefer the compact Alert with a single summarizing sentence.\n\n====================================================================\nOUTPUT\n====================================================================\n- Emit ONE valid JSON object only. No prose, no markdown fences, no comments.\n- Use exact values from the plan (axis tick labels, risk names, computed scores, band cutoffs, legend actions, titles).\n- Self-check before finishing:\n (1) every cell has a score number AND a band-colored emoji;\n (2) the diagonal gradient is visible and captioned;\n (3) every named risk is in its correct cell (verified arithmetic) AND in a summary card;\n (4) the legend lists all three bands with range + action;\n (5) COMPOSITION: Stack gap \"sm\"; table header labels abbreviated; cells use short codes not full names; Table \"fz\":\"xs\"; summary cards 2-col with \"xs\" spacing/padding and vertical stacking; all secondary text \"xs\"/\"sm\"; no horizontal row can exceed container width. The layout MUST be compact enough that nothing clips." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_risk-matrix/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_risk-matrix/report.md new file mode 100644 index 0000000..65c2a68 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_risk-matrix/report.md @@ -0,0 +1,5 @@ +# GEPA journey: risk-matrix + +rubric criteria: 7; val: 2 + +**Seed 0.6101 → Best 0.8474 (+0.2373)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_roadmap/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_roadmap/best_prompts.json new file mode 100644 index 0000000..719f1b5 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_roadmap/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'roadmap' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Each initiative is a bar spanning quarter-start to quarter-end\n- The x-axis is formatted by quarter (e.g. Q1 '26), not by day\n- Bars are colored by team/owner\n- A legend maps colors to teams\n- It reads as a strategic roadmap (coarse quarters), not a day-level Gantt with a today rule\n- The chart has a title and subtitle\n- Tooltip shows the initiative, team, and quarter span\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "You are generating a single valid Vega-Lite v6 JSON specification for a \"roadmap board\" style chart (Gantt-style horizontal bars), based on a detailed plan.\n\n{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\n## What you are producing\nA horizontal timeline / Gantt-style bar chart, one row per initiative, where each bar spans from its start boundary to its end boundary. 
Output ONLY a single valid JSON object (the Vega-Lite spec). No prose, no code fences, no placeholders \u2014 bake in the SPECIFIC titles, teams, colors, initiatives, dates, and spans the plan names.\n\n## Required structure (apply unless the plan explicitly overrides)\n- `\"$schema\": \"https://vega.github.io/schema/vega-lite/v6.json\"`\n- `title` object with `text` (the plan's Title) and `subtitle` (the plan's Subtitle), copied verbatim.\n- `\"width\": \"container\"`, and an explicit `height` sized to the number of rows (~36\u201340px per row).\n- `mark`: `{\"type\": \"bar\", \"tooltip\": true, \"cornerRadius\": 3, ...}` with a sensible bar height/band.\n- `data.values`: one object per initiative.\n\n## Data fields per initiative\nFor each initiative include AT LEAST these fields:\n- `initiative` (string, exact name from plan)\n- `team` (string, exact team name from plan)\n- `start` (ISO date \"YYYY-MM-DD\")\n- `end` (ISO date \"YYYY-MM-DD\")\n- `span` (the human-readable quarter span string, e.g. \"Q1 '26 \u2013 Q2 '26\")\n- an `order` integer for explicit row sorting\n\n### Quarter-to-date mapping (snap bars to quarter boundaries)\n- Q1 = Jan 1 \u2192 Mar 31 (start \"YYYY-01-01\", end \"YYYY-03-31\")\n- Q2 = Apr 1 \u2192 Jun 30 (start \"YYYY-04-01\", end \"YYYY-06-30\")\n- Q3 = Jul 1 \u2192 Sep 30 (start \"YYYY-07-01\", end \"YYYY-09-30\")\n- Q4 = Oct 1 \u2192 Dec 31 (start \"YYYY-10-01\", end \"YYYY-12-31\")\nA multi-quarter bar starts at the start boundary of its first quarter and ends at the end boundary of its last quarter.\n\n## Encoding\n- `y`: field `initiative`, type `nominal`, sorted by the explicit `order` field (or an explicit array) so rows read in the plan's intended top-to-bottom narrative, grouped by team. Axis title \"Initiative\".\n- `x`: field `start`, type `temporal`, with `scale.domain` of the full year (e.g. `[\"2026-01-01\",\"2026-12-31\"]`). 
Axis title \"Quarter\".\n- `x2`: field `end`.\n- Quarter-formatted axis: show exactly 4 ticks labeled `Q1 '26`, `Q2 '26`, `Q3 '26`, `Q4 '26`. Use `axis.values` at mid-quarter dates (e.g. `[\"2026-02-15\",\"2026-05-15\",\"2026-08-15\",\"2026-11-15\"]`) with a `labelExpr` that maps to the quarter label array, OR tick at quarter starts. Do NOT render day/week gridlines or a \"today\"/now reference line \u2014 coarse quarter columns only.\n- `color`: field `team`, type `nominal`, with `scale.domain` listing teams in plan order and `scale.range` listing the EXACT hex colors named in the plan. Add `legend` with `title: \"Team\"` and `orient: \"top-right\"`.\n\n## CRITICAL: tooltip_details (this is the criterion that keeps failing)\nA tooltip with only one or two fields, or only the pre-formatted span string, scores ZERO. Make the tooltip rich and informative. The `tooltip` array MUST include, at minimum:\n- `initiative` (title \"Initiative\")\n- `team` (title \"Team\")\n- the quarter span (title \"Quarter span\")\n- the actual `start` date as a `temporal` field with a `format` (e.g. `{\"field\":\"start\",\"type\":\"temporal\",\"title\":\"Start\",\"format\":\"%b %d, %Y\"}`)\n- the actual `end` date as a `temporal` field with a `format` (title \"End\")\n\nSurfacing the real start/end dates AND the human span (not just one of them) is what satisfies the tooltip rubric. Always prefer more meaningful tooltip detail over less.\n\nProduce the single JSON object now. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_roadmap/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_roadmap/report.md new file mode 100644 index 0000000..e19231c --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_roadmap/report.md @@ -0,0 +1,5 @@ +# GEPA journey: roadmap + +rubric criteria: 7; val: 2 + +**Seed 0.6590 → Best 0.6675 (+0.0085)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_security-threat-model/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_security-threat-model/best_prompts.json new file mode 100644 index 0000000..4362b16 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_security-threat-model/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are planning a 'security-threat-model' board that will be rendered on a fixed canvas and scored against a STRICT rubric. Every criterion must be satisfied explicitly and concretely.\n\nINPUT FORMAT\n- Use case: {topic} (the threat scenario and the attack vectors involved)\n- Reader: {audience}\n- Goal: {purpose}\n\nTASK\nProduce a concise plain-text plan that names the SPECIFIC content (real labels, real values, kinds, colors, edge styles, and x/y coordinates) for every node and edge. Generic plans fail. Each rubric criterion below must be unambiguously and individually satisfiable from your plan.\n\n=== RUBRIC CRITERIA \u2014 satisfy EVERY one explicitly ===\n\n1. ROOT = ASSET. Exactly one root node at the TOP, kind=asset, naming the attacker goal AND the protected asset concretely (e.g. \"ASSET: Customer PII (Customer DB + S3 archives) \u2014 Goal: Exfiltrate\"). Color it neutral gray/blue.\n\n2. VECTORS LAYER (vectors_layer). Every attack vector from the use case is its own node, kind=vector, placed in ONE horizontal row directly below the root. 
Each vector node gets 1\u20132 descriptive sub-detail nodes (also kind=vector, placed just below their parent vector) describing the concrete technique. Name them specifically.\n\n3. MITIGATIONS LAYER (mitigations_layer). Every vector connects to one or more kind=mitigation nodes naming a concrete, named control (e.g. \"FIDO2/WebAuthn keys\", \"AWS Block Public Access\"). Mitigation nodes form the bottom row(s).\n\n4. STATUS COLORING (status_coloring) \u2014 THIS IS CRITICAL, it scored 0.0 repeatedly. EVERY node AND EVERY edge must carry an explicit mitigation-state color drawn ONLY from these three:\n - mitigated = green (#2e7d32)\n - partial = amber (#f9a825)\n - open = red (#c62828)\n State the color word AND hex for each node and each edge. Do NOT introduce neutral/dark colors anywhere except the single asset root. For each edge, write its color explicitly: an edge into a mitigation takes that mitigation's state color; a vector's edge from the asset takes that vector's worst (most-open) child state color.\n\n5. GAP / DASHED RED (gap_dashed) \u2014 scored 0.0 often. Every unmitigated gap MUST be represented as BOTH:\n (a) a dashed red edge (style=dashed, color=red #c62828), AND\n (b) a node or edge label containing the literal word \"GAP\".\n Represent a gap as a dedicated node, kind=mitigation, color=red, labeled \"GAP: <what is missing>\", connected by a dashed red edge. Ensure at least one gap exists and is rendered this exact way. List each gap edge explicitly as \"dashed red, label GAP\".\n\n6. LEGEND (legend_present) \u2014 scored 0.0 when missing. Include an explicit legend node/box placed in a corner (top-right) that maps:\n - green = Mitigated\n - amber = Partial\n - red = Open\n - dashed red = Unmitigated GAP\n The legend MUST be an actual placed element with its own coordinates, not just prose.\n\n7. LAYOUT TOP-TO-BOTTOM. Row order from top: (row 0) asset root, (row 1) vectors, (row 2) vector sub-details, (row 3+) mitigations & gap nodes. 
Edges flow downward.\n\n=== GEOMETRY \u2014 prevent off-canvas & overlaps (these tanked every score) ===\nAssume a canvas of 1280 (wide) x 800 (tall). EVERY node must fit fully inside with margin:\n- Node anchors are top-left; assume node size ~180w x 70h. Keep every anchor x in [40, 1060] and every anchor y in [20, 710] so the whole node (anchor + size) stays inside the canvas with a 40px left/right and 20px top/bottom margin.\n- Give EVERY node explicit (x, y) coordinates in your plan.\n- Distribute nodes evenly across the width; do NOT let any node exceed the right/bottom edge.\n- Minimum horizontal gap between sibling node anchors: 200px. Minimum vertical gap between rows: 130px.\n- NEVER place two nodes at overlapping coordinates. If you have many mitigations, spread them across multiple rows but keep them on-canvas.\n- If the number of vectors is large, reduce sub-detail nodes to keep within width rather than going off-canvas.\n- Place the legend at roughly (1000, 20) and ensure it stays within bounds.\n\n=== COMPREHENSION (comp scores were very low ~0.14) ===\nMake the plan readable and self-explanatory: short node labels (\u22646 words on the visible label, details in subs), a clear one-line answer to the reader's GOAL (e.g. ranked list of open paths), and consistent state vocabulary throughout. Tie the colors back to the goal so the reader instantly sees what is open vs mitigated.\n\n=== OUTPUT FORMAT ===\nPlain text. Provide:\n1. A node table: id | label | kind (asset/vector/mitigation) | state (mitigated/partial/open) | color (word + hex) | (x, y)\n2. An edge table: from \u2192 to | color (word + hex) | style (solid/dashed) | label (blank or \"GAP: ...\")\n3. Legend element with coordinates.\n4. One-line \"Reader takeaway\" answering the goal (e.g. prioritized open paths).\n\nBe concrete: use real control names, real vector techniques, and real coordinates. 
Verify before finishing that: every node/edge has a green/amber/red color; at least one gap uses a dashed red edge AND a \"GAP\" label; the legend is a placed element; all coordinates are on-canvas with no overlaps; and the tree reads top-to-bottom.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_security-threat-model/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_security-threat-model/report.md new file mode 100644 index 0000000..6226465 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_security-threat-model/report.md @@ -0,0 +1,5 @@ +# GEPA journey: security-threat-model + +rubric criteria: 7; val: 2 + +**Seed 0.1452 → Best 0.1474 (+0.0023)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_sequence-diagram/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_sequence-diagram/best_prompts.json new file mode 100644 index 0000000..b1fb01f --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_sequence-diagram/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'sequence-diagram' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are producing a concrete, geometry-explicit plan that will be rendered and scored against a strict automated rubric. 
The rubric checks ACTUAL coordinates and arrow endpoints, so vague conventions (\"solid = call\") are NOT enough \u2014 every arrow must have explicit numeric start/end points, an explicit direction, and an explicit visual style.\n\n## CANVAS RULES (critical \u2014 past plans lost most points to off-canvas/overlap)\n- Assume a canvas of width 1000 and height 700. EVERYTHING must fit inside: 0 < x < 1000 and 0 < y < 700 with margins.\n- Place at most 4 participants. Keep lifeline X positions well inside the canvas: use X = 140, 380, 620, 860 for four participants (evenly spaced, 240px apart). For three participants use X = 180, 500, 820. For two use X = 250, 750.\n- Header boxes sit at the TOP: each box centered on its lifeline X, top edge at Y=30, box height ~50 (so box spans Y=30\u201380). Lifeline dashed vertical line runs from Y=80 down to Y=660.\n- Arrow Y positions must stay between Y=110 and Y=640. With N messages, space them evenly: pick a vertical step so all arrows fit (e.g., 8 messages \u2192 step ~60px starting at Y=120). NEVER let an arrow exceed Y=640.\n- LIMIT total messages to 8 or fewer so spacing stays large and nothing overlaps. Fewer, clearer messages score better than many crowded ones.\n- Each arrow occupies its own unique Y row. No two arrows share a Y. No arrow label may collide with another arrow.\n\n## ARROW SPECIFICATION (each message MUST list ALL of these explicitly)\nFor every message give a numbered row with:\n1. Y coordinate (unique, in time order top\u2192bottom = earliest\u2192latest).\n2. fromX \u2192 toX as actual numbers (the lifeline X of sender and receiver). The arrow is drawn horizontally at that Y from fromX to toX.\n3. Direction word: the arrowhead is at toX (the receiver). State it explicitly, e.g. \"head at X=620 (Service)\".\n4. Style:\n - CALL/REQUEST = SOLID line, FILLED/closed triangular arrowhead.\n - RESPONSE/RETURN = DASHED line, OPEN/thin (V-shaped) arrowhead.\n Make calls and returns visually opposite. 
A return for a call from A\u2192B must be drawn as B\u2192A (arrow physically reversed, head pointing back at the original caller).\n5. Label text placed just above the arrow line, with REAL concrete values (method names, status codes, IDs, amounts, tokens) \u2014 never generic placeholders.\n\n## RUBRIC CRITERIA (must satisfy EVERY one)\n- participant_lifelines: each participant has a labeled header box AND a vertical dashed lifeline directly beneath it at the SAME X. State the exact X for both box and lifeline so they align.\n- aligned_lifelines: lifelines evenly spaced, all vertical, same top/bottom Y. Use the exact X set above.\n- ordered_messages: messages in strict vertical time order, earliest at smallest Y. Number them 1..N with increasing Y.\n- call_vs_return: requests and responses must be CLEARLY different \u2014 solid+filled-head vs dashed+open-head. Restate this per arrow.\n- directionality: every arrow's head points from sender to receiver; for returns the arrow is physically reversed. State fromX\u2192toX and \"head at <receiver X>\" for every single arrow.\n- every-arrow-labeled: concrete label on each.\n- every-participant-active: each declared participant must send or receive \u22651 message; list which message numbers touch each participant.\n\n## OUTPUT FORMAT (concise plain text)\n1. Participants: list each with header box X (and Y=30\u201380) and lifeline X (Y=80\u2013660). Confirm even spacing.\n2. Messages table: # | Y | fromX\u2192toX | head-at (receiver) | CALL(solid,filled) or RETURN(dashed,open) | concrete label.\n3. Participant-activity check: each participant \u2192 message numbers it appears in.\n4. Geometry check: confirm all X in (0,1000), all Y in (110,640), no shared Y, even spacing, no overlaps.\n\nBe concrete with real values. Keep within the canvas. 
Prefer \u22648 messages.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_sequence-diagram/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_sequence-diagram/report.md new file mode 100644 index 0000000..a1e3276 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_sequence-diagram/report.md @@ -0,0 +1,5 @@ +# GEPA journey: sequence-diagram + +rubric criteria: 7; val: 2 + +**Seed 0.2548 → Best 0.5056 (+0.2508)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_service-health/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_service-health/best_prompts.json new file mode 100644 index 0000000..e79c096 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_service-health/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'service-health' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Each service tile is colored by health: healthy=green, degraded=amber, down=red\n- Each service tile shows an inline sparkline (latency/throughput trend)\n- Each tile shows a current SLI value (e.g. 'p95 214ms') in a meta line\n- Each tile shows the service role/purpose in a sub line\n- Edges connect services in dependency order, colored to match the target's health\n- A legend maps the health colors to healthy/degraded/down\n- No tiles overlap and edges do not pass through unrelated tiles\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_service-health/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_service-health/report.md new file mode 100644 index 0000000..3cc19f7 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_service-health/report.md @@ -0,0 +1,5 @@ +# GEPA journey: service-health + +rubric criteria: 7; val: 2 + +**Seed 0.5727 → Best 0.5727 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_shopping-comparison/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_shopping-comparison/best_prompts.json new file mode 100644 index 0000000..362039b --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_shopping-comparison/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'shopping-comparison' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Each shopping option is its own card in a comparison grid\n- Each option shows a prominent product image\n- Each option shows its price\n- The cheapest (or best-value) option is flagged with a badge\n- Each option lists key specs/features for comparison\n- Each option has an outbound buy/visit link\n- Each option shows a rating or review score\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_shopping-comparison/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_shopping-comparison/report.md new file mode 100644 index 0000000..c375250 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_shopping-comparison/report.md @@ -0,0 +1,5 @@ +# GEPA journey: shopping-comparison + +rubric criteria: 7; val: 2 + +**Seed 0.8007 → Best 0.8007 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_sprint-burndown/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_sprint-burndown/best_prompts.json new file mode 100644 index 0000000..43dc52b --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_sprint-burndown/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'sprint-burndown' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Two lines are plotted: an ideal burndown and the actual remaining work\n- The ideal line is a straight dashed/grey line from total down to zero\n- The actual line is a distinct color with marked points and stops at today\n- A vertical 'today' rule line marks the current sprint day\n- The y-axis is fixed from 0 to the sprint total so both lines share a frame\n- Axes are titled (sprint day / story points remaining)\n- A legend distinguishes the ideal vs actual series\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_sprint-burndown/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_sprint-burndown/report.md new file mode 100644 index 0000000..e3e25ba --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_sprint-burndown/report.md @@ -0,0 +1,5 @@ +# GEPA journey: sprint-burndown + +rubric criteria: 7; val: 2 + +**Seed 0.6448 → Best 0.6448 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_sre-incident/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_sre-incident/best_prompts.json new file mode 100644 index 0000000..903a6f2 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_sre-incident/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan an 'sre-incident' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are producing a concise plain-text plan that names the SPECIFIC content (real values, labels, structure) for an SRE incident board. Generic plans fail \u2014 every field must have concrete, realistic values.\n\n## Rubric \u2014 the plan MUST satisfy EVERY criterion:\n1. A clear incident title at the top of the board\n2. A header stating incident severity, status, and a one-line summary\n3. A timeline of incident events covering detect \u2192 escalate \u2192 mitigate \u2192 resolve\n4. Impact metrics (affected users, error rate, duration) shown as stats\n5. Severity/status is color-coded (red for active/critical, amber/orange for mitigating or SEV2, green for resolved)\n6. Follow-up action items / next steps are listed\n7. 
An incident commander/owner is explicitly named\n\n## Required content and concrete conventions:\n\n### Incident Title\n- Use an INC ID + descriptive name + affected service, e.g. \"INC-2024-0317: Checkout Service Outage \u2014 Payment Processing Down\" or \"INC-2847: API Gateway p95 Latency Spike \u2014 User-Service Endpoints\".\n\n### Header (color-coded badges + summary)\n- Severity badge with explicit color: SEV1 = solid RED, SEV2/SEV3 = AMBER/ORANGE.\n- Status badge with explicit color: ACTIVE = red, MITIGATED/MITIGATING = amber, RESOLVED = green. Note the transition path (ACTIVE \u2192 MITIGATED \u2192 RESOLVED).\n- One-line summary: a single concrete sentence describing symptom, scope (% of users/requests), and root cause, e.g. \"p95 latency on /v2/users endpoints rose from 180ms to 1.4s for 47 minutes due to connection pool exhaustion after a config deploy.\"\n\n### Incident Commander / Owner\n- Name a specific IC with handle, e.g. \"IC: Priya Sharma (@psharma)\". Optionally add Comms Lead, Ops, Scribe, or affected service owner \u2014 but keep this compact.\n\n### Impact Metrics (stat cards / tiles)\n- Always include: Affected users (count + % of active), Error rate (peak + baseline), Duration (minutes + UTC window). Add 1\u20132 relevant extras (revenue impact, p95 latency, replication lag, failed transactions, requests degraded) appropriate to the incident type.\n- Example: \"~84,000 (40% of active) | 52% peak (baseline 0.2%) | 47 min (14:03\u219214:50) | ~$128K est. lost orders\".\n\n### Timeline\n- Vertical, timestamped (UTC), with color dots, covering the four required phases plus optional investigate/identify steps.\n- Each entry: time + phase label + concrete event (tool names like Datadog/PagerDuty/Argo, alert names, channel names, build/version numbers, root cause).\n- Example: \"14:03 \ud83d\udd34 DETECT \u2014 Datadog alert 'checkout_5xx > 5%' fires; PagerDuty pages on-call\" ... 
\"14:50 \ud83d\udfe2 RESOLVE \u2014 Error rate back to 0.3%; status \u2192 RESOLVED\".\n\n### Follow-up Action Items\n- Checklist with concrete owner + due date each, optionally priority-tagged (P0/P1/P2).\n- Include a mix: add an alert/monitor, add a regression test or CI gate, a runbook/doc, and schedule a blameless postmortem.\n- Example: \"\u2610 [P0] Add connection-pool leak regression test \u2014 Liang, due Mar 19\".\n\n### Color Key / Legend\n- State the legend explicitly: Red = active/critical/SEV1 \u00b7 Amber/Orange = mitigating/SEV2 \u00b7 Yellow = investigating \u00b7 Green = resolved \u00b7 Gray/Blue = info.\n\n## Layout (specify top \u2192 bottom)\n1. Title bar (full width)\n2. Header row: severity + status badges + one-line summary + IC/roles\n3. Impact stat-card row (4\u20135 tiles)\n4. Two columns: Timeline (left) | Action Items (right)\n5. Color legend (one compact line; fold it into the header row instead of a separate footer block if space is tight)\n\n## Critical quality guidance (from scoring):\n- VISUAL QUALITY & GEOMETRY: Keep the plan compact and balanced so content does NOT overflow or clip. Avoid cramming too many rows/sections (e.g. an oversized 6-row timeline plus a 5-section sidebar with separate legend caused overflow). Prefer the four core phases in the timeline; merge legend into the header rather than a large separate block; limit action items to ~5. Favor concise badges and tidy two-column balance over exhaustive detail.\n- COMPREHENSION: Make every value self-explanatory and unambiguous; show baselines next to peak values, label every stat, and make the severity/status colors and the timeline phases immediately readable. 
Don't bury the summary or IC.\n- Be concrete everywhere \u2014 real names, real numbers, real timestamps, real tool/alert names.\n\nOutput the plan as concise plain text using the structure above.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_sre-incident/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_sre-incident/report.md new file mode 100644 index 0000000..4901dcf --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_sre-incident/report.md @@ -0,0 +1,5 @@ +# GEPA journey: sre-incident + +rubric criteria: 7; val: 2 + +**Seed 0.8680 → Best 0.9531 (+0.0851)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_stacktrace/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_stacktrace/best_prompts.json new file mode 100644 index 0000000..8e96d2c --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_stacktrace/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'stacktrace' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are planning a visual board that renders a single error stacktrace as monospace\ntext. Output a CONCISE plain-text plan that names SPECIFIC content (real function\nnames, real file paths, real line numbers, exact labels). Generic plans fail.\n\nThis is judged against a strict rubric. 
A great board MUST satisfy EVERY criterion:\n- monospace_lines: the trace is rendered as monospace text lines preserving indentation\n- color_by_role: lines colored by role \u2014 culprit/error line RED, your code BLUE,\n library/runtime DIMMED gray\n- indentation_depth: call depth shown via consistent leading-space indentation, one\n level per frame\n- cause_alert: an alert box calls out the likely cause/culprit frame\n- badge legend: a legend explains the line colors\n- frame legibility: each frame shows the function AND file/location\n- linearity: the stack reads top-to-bottom as one linear call path (no graph/arrows/boxes)\n\nREQUIRED STRUCTURE (produce all of these sections):\n1. Board Title \u2014 the exact error type + message, e.g.\n `TypeError: Cannot read properties of undefined (reading 'name')`.\n2. Badge Legend \u2014 three entries, each with a colored marker emoji + label + meaning:\n - \ud83d\udd34 Culprit / Error \u2014 the line that threw\n - \ud83d\udd35 Your code \u2014 application/component frames\n - \u26aa Library / Runtime \u2014 framework, stdlib, runtime internals (dimmed)\n3. Alert Box (above the trace) \u2014 \u26a0\ufe0f Likely cause: name the SPECIFIC culprit frame\n (function + file:line), state WHY it failed, and give a concrete fix.\n4. The Trace \u2014 a single monospace code block, one frame per line, top-to-bottom.\n5. Layout/Color rules \u2014 restate which concrete frames get which color.\n\nCRITICAL RULES \u2014 learned from scoring failures:\n\nA. Color markers go at the START of each line as a single emoji (\ud83d\udd34/\ud83d\udd35/\u26aa), NOT as\n trailing `[RED]`/`[BLUE]`/`[DIM]` comments. Trailing/end-of-line color labels and\n inline `\u25c0 threw here` markers cause the rubric to fail color detection \u2014 DO NOT use\n them. One leading color emoji per line is what scores well.\n\nB. ONE frame = ONE line. 
Each line must contain BOTH the function and its file:line in\n that single line, e.g.:\n \ud83d\udd34 at UserCard (src/components/UserCard.jsx:14:23)\n Do NOT split a frame across two lines (function on one line, source statement on the\n next). Multi-line-per-frame formats (like Python's `File \"...\", line N` + source\n line) break indentation_depth scoring. Collapse each frame to a single legible line.\n\nC. Indentation must be consistent: increase leading spaces by a FIXED amount (e.g. +2\n spaces) for each successively deeper frame, so depth reads visually as a staircase.\n The culprit/error is at the top (least indented), runtime is at the bottom (most\n indented). Apply this uniformly to every line including the error header.\n\nD. KEEP LINES SHORT to avoid overflow/clipping (a major penalty). Use SHORT relative\n file paths, not long absolute ones. Replace verbose runtime paths like\n `/usr/lib/python3.11/site-packages/django/core/handlers/base.py` with concise forms\n like `django/core/handlers/base.py`. Keep each rendered trace line comfortably under\n ~70 characters including the indentation and leading emoji.\n\nE. Do not wrap the error message across multiple lines inside the trace block; keep the\n header to one short line. Put the full explanation only in the Alert Box.\n\nF. Use ~8 frames total: 1 red culprit/error line at top, then a few blue \"your code\"\n frames, then several dimmed library/runtime frames at the bottom. 
Make function\n names, file names, and line numbers realistic and specific to the language/framework\n in the use case.\n\nExample trace block format (follow this exactly):\n```\n\ud83d\udd34 TypeError: Cannot read properties of undefined (reading 'name')\n\ud83d\udd35 at UserCard (src/components/UserCard.jsx:14:23)\n\ud83d\udd35 at ProfilePanel (src/components/ProfilePanel.jsx:27:10)\n\ud83d\udd35 at Dashboard (src/pages/Dashboard.jsx:42:6)\n\u26aa at renderWithHooks (react-dom.dev.js:16305:18)\n\u26aa at beginWork (react-dom.dev.js:21587:16)\n\u26aa at workLoopSync (react-dom.dev.js:26466:5)\n```\n\nKeep the whole plan concise and plain-text.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_stacktrace/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_stacktrace/report.md new file mode 100644 index 0000000..d6b09db --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_stacktrace/report.md @@ -0,0 +1,5 @@ +# GEPA journey: stacktrace + +rubric criteria: 7; val: 2 + +**Seed 0.7551 → Best 0.8537 (+0.0985)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_state-machine/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_state-machine/best_prompts.json new file mode 100644 index 0000000..e3bb7e4 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_state-machine/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'state-machine' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are producing a concrete build plan for a directed state-machine diagram that will be RENDERED and then graded against a STRICT 
rubric. The plan must be specified so that a renderer following it literally produces a clean diagram. Vague claims and prose assertions (\"all edges have arrowheads\") score poorly because the rendered image is what is graded. Give specific values (colors, shapes, ranks, exact labels, routing side) for every element.\n\n== HARD LESSONS FROM PAST GRADING (obey these) ==\n- TERMINAL_STATE and ARROWHEADS are the most-failed criteria EVEN WHEN described correctly in prose. The fix is to make them structurally unmissable and renderer-safe, not just to assert them.\n- NEVER use self-loop edges (a node pointing to itself). They render without visible arrowheads and tank the arrowheads score. Model \"stay/retry/hold\" behavior as an edge to a DISTINCT node or as a labeled edge to the next real state instead.\n- Keep ALL geometry ON-CANVAS. Do not invent large offset coordinates (e.g. x=+3/-3) or bow edges far out. Keep branch nodes only slightly offset from the column and keep every node within a normal bounding area.\n- Routing branch/return nodes off to one side previously caused edges to overlap unrelated nodes (no_overlap = 0). Prefer placing each branch/terminal node at its OWN distinct rank (its own vertical row), so edges to it are short, nearly-adjacent hops that do not run alongside the central column.\n- Comprehension scores are chronically low. Use small node counts, short concrete domain labels, and an obvious downward happy path. Do not over-clutter with many cross edges.\n\n== RUBRIC CRITERIA (each must be explicitly engineered) ==\n\n1. DISTINCT LABELED NODES\n - List every state as its own node with a short, unambiguous label. No merged nodes.\n\n2. EDGE LABELS (triggering event/action on EVERY edge)\n - Every transition edge \u2014 including branches and edges to terminal states \u2014 carries a named trigger written as a verb/event phrase (e.g. \"paymentCaptured\", \"timer expires (4s)\").\n\n3. 
ARROWHEADS ON EVERY EDGE\n - For EACH edge, state it has a single filled triangular arrowhead at the DESTINATION end, one consistent style for ALL edges.\n - Explicitly restate the arrowhead for every branch and return edge.\n - NO self-loops, NO plain lines, NO open ends, NO double-headed edges.\n\n4. TERMINAL/FINAL STATE DISTINCTION (make it unmistakable AND structurally real)\n - A terminal state has ZERO outgoing edges. Re-check each candidate: if it has ANY outgoing edge (retry, refund, re-run), it is NOT terminal \u2014 do not style it green.\n - Style EVERY terminal node in MULTIPLE redundant ways at once:\n a. Fill: solid GREEN #2E7D32 with white text.\n b. Border: double-ring, thick.\n c. Label text contains the literal marker \"(final)\".\n - Ensure AT LEAST ONE genuine sink exists. If the happy-path \"success\" state has any outgoing edge, add a dedicated final node downstream and route success \u2192 final node so the final node is a true sink.\n - To avoid the recurring terminal failure: make the LAST node in the central column a dedicated final sink (e.g. \"Delivered (final)\", \"Crossing Complete (final)\", \"Done (final)\"). Put it at the bottom center, give it green fill + double border + \"(final)\", and confirm it has no outgoing edge.\n - State explicitly which nodes are terminal and confirm zero outgoing edges for each.\n\n5. INITIAL/ENTRY STATE\n - A small SOLID BLACK (#000000) filled circle at the very TOP center, clearly separated above all state boxes \u2014 it is a dot, NOT a labeled box.\n - Exactly ONE edge leaves it to the first real state with a label like \"start\"/\"power on\"/\"createOrder\", with a filled triangle arrowhead at that first state.\n\n6. TOP-TO-BOTTOM LAYOUT\n - Primary/happy-path states in a single vertical center column flowing strictly downward.\n - Assign every node an explicit integer vertical rank (rank 1 = start dot at top, increasing downward). 
Give each branch/terminal node its OWN rank so it sits on its own row.\n\n7. NO EDGE PASSES THROUGH AN UNRELATED NODE\n - Keep the main downward path straight in the center, connecting only vertically adjacent ranks.\n - Place branch/terminal destination nodes on their own rank with a small left/right horizontal offset, so each edge to them is a short hop, not a long line running past other boxes.\n - Route any non-adjacent edge (skip-rank or return) along the LEFT or RIGHT outer margin, staying just outside the column, clear of every box it is not connected to. For each such edge name the margin used and confirm it crosses no unrelated node.\n - Minimize long return/loop-back edges; prefer forward progress to distinct nodes.\n\n== OUTPUT FORMAT ==\nProduce a concise plain-text plan with these sections:\n- Layout Direction (top-to-bottom; note which margin is used for any branch/return edges; confirm no self-loops and all geometry on-canvas).\n- Nodes: numbered list/table with vertical rank, label, fill color/tone, border, and role (initial / intermediate / terminal). For terminals: green #2E7D32 fill + double border + \"(final)\" + confirm zero outgoing edges. Note any small horizontal offset for branch nodes.\n- Edges: a table; for EACH edge give From \u2192 To, the trigger label, the arrowhead (filled triangle at destination), and routing (center / left-margin / right-margin). Add an explicit arrowhead restatement line for every branch and return edge.\n- Criterion Satisfaction: one concrete line per rubric criterion proving satisfaction by citing real labels/colors/ranks (not generic claims). For terminal and arrowheads, explicitly cite the sink node and confirm no self-loops exist.\n\nKeep it tight: few nodes, concrete domain labels, obvious downward flow. 
Be concrete with real values, labels, and structure.", + "generate": "You generate a single valid JSON object describing a STATE-MACHINE DIAGRAM (a directed graph of states and labeled transitions) for a flowchart/board renderer. Output ONLY the JSON \u2014 no prose, no code fences, no comments.\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\n============================================================\nWHAT THE TASK IS\n============================================================\nConvert the state-machine plan into a renderable graph object. The plan names specific states (nodes), transitions (edges), event labels, terminal/initial markers, and layout. Bake in the SPECIFIC values from the plan \u2014 real state names, real event labels \u2014 never placeholders. Preserve the plan's exact state names and event strings verbatim.\n\n============================================================\nCRITICAL LESSONS FROM PAST SCORING (read carefully \u2014 these are the recurring failures)\n============================================================\nThe rubric is scored on ACTUAL emitted JSON FIELDS, not on prose or plan \"compliance notes\". Past attempts repeatedly failed on `terminal_state` (0.0 EVERY time), `arrowheads` (stuck at 0.5), and `no_overlap` (0.0\u20130.5) even when the JSON looked superficially correct. The fixes below are mandatory.\n\n------------------------------------------------------------\nFIX 1 \u2014 terminal_state (failed 0.0 in EVERY past attempt \u2014 TOP PRIORITY)\n------------------------------------------------------------\nA node may ONLY be typed \"terminal\" if it is a TRUE ABSORBING state: it has ZERO outgoing edges. This is the root cause of every past 0.0.\n - In past failures, nodes were marked \"terminal\" but still had outgoing edges (e.g. Cancelled\u2192Refunded, or a \"resting\" state in a cyclic FSM that loops onward, or Passed\u2192Queued re-run). That is NOT terminal. 
The scorer checks out-degree.\n - PROCEDURE: After building all edges, compute out-degree for every node. A node is terminal ONLY if out-degree == 0. Type those \"terminal\" + \"status\":\"success\". Type all others \"state\" (or \"initial\").\n - If a state the plan CALLS \"terminal\" actually has an outgoing transition (loop-back, refund, retry, re-run), it is NOT terminal \u2014 type it \"state\" with an appropriate status (e.g. cyclic resting state \u2192 \"success\" status but type \"state\"; do NOT type it \"terminal\").\n - For a purely CYCLIC FSM with no absorbing state (e.g. traffic light that always loops), there may be NO terminal node at all. Do NOT force a terminal type onto a node that loops. It is correct to emit zero terminal nodes in that case.\n - Exactly ONE node must be \"type\":\"initial\".\n\n------------------------------------------------------------\nFIX 2 \u2014 arrowheads (stuck at 0.5 \u2014 never reached full credit)\n------------------------------------------------------------\n - EVERY edge MUST include \"markerEnd\": { \"type\": \"arrowclosed\" } explicitly.\n - Do NOT create self-loop edges (source == target). Self-loops render as ambiguous/degenerate arrows and cost arrowhead + overlap credit. If the plan describes a self-loop (e.g. \"button pressed while green, ignored\"), OMIT that edge entirely \u2014 it carries no state change.\n - Do NOT create two edges between the same ordered (source,target) pair.\n\n------------------------------------------------------------\nFIX 3 \u2014 no_overlap (failed 0.0\u20130.5)\n------------------------------------------------------------\n - Use \"direction\": \"TB\".\n - The graph must read as a clean top-to-bottom DAG. BACK-EDGES (a transition pointing to a node earlier/higher in the flow) cause overlap and route through unrelated nodes. AVOID emitting back-edges:\n * Loop-back / return / retry / re-run / re-enqueue edges that point upward \u2014 OMIT them. 
They are the #1 cause of no_overlap=0.0. Keep only the forward happy-path and forward branches.\n * If omitting a loop-back would disconnect the intent, prefer keeping the forward DAG clean; the renderer/scorer rewards the clean acyclic layout.\n - Branch exception/terminal states (cancel, fail, refund) downward/outward off the main spine \u2014 never let an edge pass through an unrelated node.\n - Keep node ids unique and short snake_case.\n\n------------------------------------------------------------\nFIX 4 \u2014 labeled_transitions\n------------------------------------------------------------\n - EVERY edge MUST carry its concrete event label at the TOP LEVEL as \"label\" AND duplicated in \"data.label\". Never label-only-in-data. Never empty, never placeholder. Use the exact event string from the plan (e.g. \"payment_confirmed\", \"yellow timer expires (4s)\").\n\n------------------------------------------------------------\nFIX 5 \u2014 completeness / fidelity (comp was low)\n------------------------------------------------------------\n - Preserve the plan's EXACT node label text (e.g. \"Vehicle_Green / DontWalk\", not \"Green\"). 
For the entry marker, use the plan's name if given; otherwise label it \"START\".\n - Include every forward state and every forward transition from the plan.\n\n============================================================\nSCHEMA (follow exactly)\n============================================================\n{\n \"direction\": \"TB\",\n \"nodes\": [\n {\n \"id\": \"<short_snake_case_id>\",\n \"label\": \"<exact human-readable state name from plan>\",\n \"type\": \"initial\" | \"state\" | \"terminal\",\n \"status\": \"info\" | \"neutral\" | \"warn\" | \"active\" | \"success\" | \"error\",\n \"data\": { \"label\": \"<same human-readable state name>\" }\n }\n ],\n \"edges\": [\n {\n \"id\": \"<source>-<target>\",\n \"source\": \"<node id>\",\n \"target\": \"<node id>\",\n \"label\": \"<exact transition/event name from plan>\",\n \"animated\": false,\n \"markerEnd\": { \"type\": \"arrowclosed\" },\n \"data\": { \"label\": \"<same transition/event name>\" }\n }\n ]\n}\n\n============================================================\nSTATUS / TONE CONVENTIONS\n============================================================\n- Initial/entry state: \"info\"\n- Normal intermediate states: \"neutral\"\n- In-progress / running / active states: \"active\"\n- Warning / yellow / amber / caution states: \"warn\"\n- True terminal SUCCESS state (out-degree 0): \"success\" + type \"terminal\"\n- A \"resting\"/\"done\"-toned state that still loops onward: \"success\" status but type \"state\"\n- Failure / error states: \"error\"\nMap plan colors: green\u2192success, yellow/amber\u2192warn, blue\u2192info, gray\u2192neutral, red\u2192error, in-progress\u2192active.\n\n============================================================\nPROCEDURE\n============================================================\n1. List every state in the plan \u2192 one node each (unique id, exact label at top level and in data.label, valid status).\n2. 
List every FORWARD transition \u2192 one edge each (top-level \"label\" + \"data.label\", \"markerEnd\" arrowclosed, \"animated\": false, unique id).\n3. DROP these edges entirely: self-loops (source==target), upward back-edges (loop-back/return/retry/re-run/re-enqueue), and any duplicate (source,target) pair.\n4. Compute out-degree of every node from the surviving edges.\n - Mark exactly one entry node \"type\":\"initial\".\n - Mark every node with out-degree 0 as \"type\":\"terminal\" + \"status\":\"success\".\n - All remaining nodes: \"type\":\"state\".\n5. Set \"direction\": \"TB\".\n6. Verify BEFORE emitting:\n - [ ] Exactly one node has \"type\":\"initial\".\n - [ ] Every node typed \"terminal\" has out-degree 0; every out-degree-0 reachable end state is typed \"terminal\" with \"status\":\"success\".\n - [ ] No edge has source == target.\n - [ ] No upward/back edge remains; graph is a clean top-to-bottom DAG.\n - [ ] No duplicate (source,target) edge pairs.\n - [ ] Every edge has non-empty top-level \"label\" and \"data.label\", plus \"markerEnd\":{\"type\":\"arrowclosed\"}.\n - [ ] Every node has top-level \"label\", \"data.label\", and \"status\".\n7. Output ONLY the JSON object." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_state-machine/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_state-machine/report.md new file mode 100644 index 0000000..2ae8725 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_state-machine/report.md @@ -0,0 +1,5 @@ +# GEPA journey: state-machine + +rubric criteria: 7; val: 2 + +**Seed 0.2416 → Best 0.6187 (+0.3771)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_swimlane-process/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_swimlane-process/best_prompts.json new file mode 100644 index 0000000..1f99344 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_swimlane-process/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are planning a 'swimlane-process' board that a RENDERER will parse into a diagram on a fixed 1280\u00d7720 canvas. Output is parsed line-by-line as structured data. Prose and markdown tables are NOT reliably parsed and score ZERO. Emit one element per line in flat key=value form. Be COMPACT so the output never gets truncated \u2014 the LEGEND at the end MUST appear in full.\n\nINPUTS:\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\n=========================================================\nCRITICAL GEOMETRY MODEL (this is where past plans failed):\n=========================================================\n- Canvas is 1280 wide \u00d7 720 tall. Treat the ORIGIN (x,y) of every node and the legend as its TOP-LEFT corner, NOT its center. The renderer draws the box from (x,y) extending RIGHT by width and DOWN by height.\n- Fixed node size: width=150, height=56. Therefore a node at (x,y) occupies x..x+150 and y..y+56.\n- HARD BOUNDS so nothing goes off-canvas:\n * x must satisfy 20 \u2264 x AND x+150 \u2264 1260 \u2192 so x \u2264 1110. 
Never give a node x > 1110.\n * y must satisfy: node fits ENTIRELY inside its lane band with 8px padding.\n- Use HORIZONTAL lanes, flow LEFT \u2192 RIGHT, consistently.\n- Reserve a label column x:0\u2013140 for lane role labels; put nodes at x \u2265 150.\n- NO OVERLAPS. If two nodes share a lane, they MUST differ in x by at least 200 (150 width + 50 gap). Do NOT place two nodes in the same lane at the same x with different y \u2014 keep one row of nodes per lane wherever possible. If you must place two nodes in one lane near the same x, separate them by at least 200 in x.\n\nLANE LAYOUT (use these EXACT bands for 4 lanes; reserves bottom for legend):\n- Lane 1 band: y 20\u2013160 \u2192 node y must be in 28\u201396 (use y=60)\n- Lane 2 band: y 160\u2013300 \u2192 node y must be in 168\u2013236 (use y=200)\n- Lane 3 band: y 300\u2013440 \u2192 node y must be in 308\u2013376 (use y=340)\n- Lane 4 band: y 440\u2013580 \u2192 node y must be in 448\u2013516 (use y=480)\n- LEGEND zone: y 600\u2013700 (no lanes or nodes here).\n(If the use case names exactly 3 lanes, use bands 20\u2013160, 160\u2013300, 300\u2013440 and keep legend at y 600\u2013700. If 5 lanes, shrink bands to 110px each within y 20\u2013570.)\n\nAssign each lane the FULL width x:0\u20131280 (full-length, equal-height, parallel, non-overlapping).\n\n=========================================================\nOUTPUT FORMAT \u2014 emit EXACTLY these sections, one record per line.\nEvery attribute literally present on EVERY line. 
No tables, no prose paragraphs.\n=========================================================\n\nCANVAS: width=1280 height=720 orientation=horizontal flow=left-to-right\n\nLANES: (one line per lane)\nLANE id=<ID> role=\"<role label>\" orientation=horizontal x_min=0 x_max=1280 y_min=<top> y_max=<bottom> full_length=true equal_size=true\n\nNODES: (one line per node; every node has BOTH label and detail; both must be non-empty phrases, never a single bare word)\nNODE id=<ID> lane=<LANE_ID> label=\"<short action>\" detail=\"<sub/meta detail line>\" x=<x> y=<y> w=150 h=56\n\nHAPPY_EDGES: (one line per happy-path edge; EACH line individually carries color=green AND style=solid)\nHAPPY_EDGE from=<NODE_ID> to=<NODE_ID> from_lane=<LANE_ID> to_lane=<LANE_ID> label=\"<hand-off reason>\" color=green style=solid\n\nERROR_EDGES: (one line per error/retry edge; EACH line individually carries color=red AND style=dashed; include AT LEAST ONE failure/retry edge)\nERROR_EDGE from=<NODE_ID> to=<NODE_ID> from_lane=<LANE_ID> to_lane=<LANE_ID> label=\"<failure/retry reason>\" color=red style=dashed\n\nCROSS_LANE_HOPS: (one line per hand-off edge where from_lane \u2260 to_lane)\nHOP from=<NODE_ID> to=<NODE_ID> from_lane=<LANE_ID> to_lane=<LANE_ID> differ=true label=\"<hand-off reason>\"\n\nLEGEND: (MUST be emitted in full \u2014 do not truncate. Place inside canvas at y 600\u2013700.)\nLEGEND x=160 y=610 w=420 h=80\nLEGEND_ENTRY sample=line color=green style=solid x1=170 y1=640 x2=230 y2=640 text=\"Happy path\"\nLEGEND_ENTRY sample=line color=red style=dashed x1=170 y1=675 x2=230 y2=675 text=\"Error / retry path\"\n\n=========================================================\nRUBRIC REQUIREMENTS (each must be individually detectable):\n=========================================================\n1. ACTOR_LANES: Define exactly the lanes named in the use case. One LANE line each, unique id, single role label, full width, equal height, non-overlapping bands as specified above.\n2. 
STEP_PLACEMENT: Every NODE has lane=<one lane id> = the actor who performs it.\n3. STEP_CONTEXT: Every NODE has BOTH a non-empty label AND a non-empty detail. Never a single bare word in either.\n4. HAPPY_PATH_COLORED: Every HAPPY_EDGE line carries color=green AND style=solid on the same line. (Do not state the color once globally.)\n5. ERROR_PATH_DASHED: Every ERROR_EDGE line carries color=red AND style=dashed on the same line. Include at least one error/retry edge.\n6. CROSS_LANE_HOPS: For each hand-off, name source node, target node, source lane, target lane, confirm differ=true, and give a hand-off reason label.\n7. LEGEND_PRESENT: Emit the LEGEND block fully with a green-solid sample entry and a red-dashed sample entry, inside canvas bounds, not overlapping lanes/nodes.\n\n=========================================================\nCONTENT REQUIREMENTS:\n=========================================================\n- Use SPECIFIC, real values tied to the use case: concrete IDs (e.g. order/application numbers), SLAs (e.g. \"SLA 48h\"), document names (e.g. \"Form 1003\", \"W-2\"), decision thresholds (e.g. \"DTI \u2264 43%\", \"amount \u2264 $2,000\"), system/gateway names, status codes. Generic plans fail.\n- Build a coherent flow: start node in the first actor's lane, hand off across lanes following the real process, end at the final actor. 
Include at least one failure/retry loop that crosses lanes back to an earlier actor.\n- Keep node count to roughly 6\u20139 so all nodes fit without overlap given the x \u2264 1110 ceiling and 200px spacing.\n\nSELF-CHECK before finishing (verify silently, then output):\n- Every node: x \u2265 150 AND x \u2264 1110; y within its lane's allowed node-y range; no two nodes in the same lane within 200px of each other in x.\n- Every happy edge line has color=green style=solid; every error edge line has color=red style=dashed.\n- LEGEND block emitted completely (do not let it get cut off \u2014 keep the rest of the output terse to leave room).", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_swimlane-process/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_swimlane-process/report.md new file mode 100644 index 0000000..e5b6406 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_swimlane-process/report.md @@ -0,0 +1,5 @@ +# GEPA journey: swimlane-process + +rubric criteria: 7; val: 2 + +**Seed 0.1328 → Best 0.2273 (+0.0945)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_task-checklist/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_task-checklist/best_prompts.json new file mode 100644 index 0000000..43e976b --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_task-checklist/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'task-checklist' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis plan will be rendered as a visual board and judged on FOUR axes: comprehension, visual quality, geometry 
(no overflow/clipping), and a strict rubric. Past boards reliably score 1.00 on visual quality, geometry, and rubric, but LOSE the most points on COMPREHENSION. Your top priority is maximizing comprehension WITHOUT breaking the other three.\n\nA great board MUST satisfy EVERY rubric criterion:\n- A checklist with multiple labeled items is shown\n- Items show clear checked vs unchecked states (visible tick boxes)\n- The list includes both completed and pending items\n- Items are grouped into clearly distinct labeled sections (e.g. an agent/automated section and a human sign-off section)\n- The checklist has a clear title naming the specific process\n- Overall progress is evident from the ratio of checked to unchecked items\n- Each item's label is legible and describes a concrete step\n\nCRITICAL CONSTRAINTS (learned from scoring):\n\n1. KEEP IT SMALL TO AVOID OVERFLOW. Use EXACTLY 3 sections (3 sections scored higher on comprehension than 2). Keep the TOTAL number of items between 6 and 8 \u2014 aim for 7. Do NOT exceed 8 items or 3 sections; ~14 items overflow/clip and score badly. Aim for 2\u20133 items per section.\n\n2. STRONG SECTION GROUPING DRIVES COMPREHENSION. Make sections semantically distinct, clearly named, and ordered logically. Prefer a clear pipeline: automated/system steps FIRST, then a hygiene/preparation group, then HUMAN SIGN-OFFS LAST. Examples of strong distinct headers: \"Automated Gates (CI-verified)\", \"Repo Hygiene\", \"Human Sign-offs\", \"Automated Steps (Agent-ticked)\". Each section MUST have its own header with a per-section progress count like [3/4]. Generic or weakly-distinguished groupings lose comprehension points.\n\n3. MIX STATES IN EVERY PLAN. Include both checked (\u2611) and unchecked (\u2610) items overall \u2014 roughly 4 checked and 3 unchecked, so progress is partial (~57%), not all done or all pending. Within each section, order completed (\u2611) items above pending (\u2610) items.\n\n4. 
MAXIMIZE COMPREHENSION \u2014 THIS IS WHERE POINTS ARE LOST. Make every item instantly understandable:\n - Each label = ONE concrete, specific action, kept SHORT and plain.\n - Each label has a brief real-value detail: a name, ID, count, file path, percentage, build number, or timestamp.\n - Make the checked/unchecked meaning unambiguous from the detail itself (e.g. \"482/482 passed\" reads as done; \"awaiting @priya-k review\" reads as pending).\n - Avoid long parenthetical instructions inside labels. No vague verbs.\n - Use SPECIFIC, real-looking values throughout. Generic plans fail comprehension.\n\nOUTPUT FORMAT \u2014 produce a concise plain-text plan with these parts:\n\n1. TITLE: A specific process title naming the real subject (e.g. \"Release v2.4.1 \u2014 Deployment Checklist\", \"PR #482 Merge Checklist\"). Add a subtitle showing overall progress as \"X / Y complete (Z%)\" \u2014 use the real ratio, ~4/7 (57%). Optionally append one identifying real detail (flight number, build, timestamp).\n\n2. SECTIONS (exactly 3): For each section give:\n - A distinct section header naming the group type (automated vs human, or themed pipeline categories), with a per-section count [done/total].\n - Its items, each as: a checkbox state (\u2611 for checked / \u2610 for unchecked) + a short bold action label + a brief real-value detail.\n - Use \u2611 for completed items and \u2610 for pending items. Order completed items above pending within each section. Keep the human sign-off section last when applicable.\n\n3. RUBRIC COVERAGE: A short list explicitly stating how each of the 7 criteria is met, citing concrete content (item counts, the checked/unchecked ratio like 4 checked / 3 pending, the section names, the title, the progress figure, and example real details).\n\n4. 
VISUAL NOTES (brief): Stacked card layout, one section per card, three cards top to bottom; top title bar with the title, subtitle, and a progress bar filled to ~the stated %; \u2611 checked items in green; \u2610 pending items in amber/gray; section headers bold with right-aligned count badges ([3/4] etc.). Emphasize compact spacing, fixed card width, and short labels so nothing overflows or clips.\n\nUse SPECIFIC, real-looking values throughout (IDs, names, file paths, counts, percentages, timestamps). Generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_task-checklist/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_task-checklist/report.md new file mode 100644 index 0000000..945d4a9 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_task-checklist/report.md @@ -0,0 +1,5 @@ +# GEPA journey: task-checklist + +rubric criteria: 7; val: 2 + +**Seed 0.6107 → Best 0.7281 (+0.1174)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_task-progress/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_task-progress/best_prompts.json new file mode 100644 index 0000000..22f4c41 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_task-progress/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'task-progress' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nOutput a CONCISE plain-text plan that names the SPECIFIC content (real values, labels, rows, structure) for a board. Invent realistic concrete data fitting the use case (names, IDs, dates, counts, ETAs). 
Generic plans fail \u2014 every element must have real values.\n\nThe board is judged on comprehension, visual quality, geometry (NOTHING may overflow/clip), and a strict rubric. PRIORITIZE comprehension and fitting-in-the-space over packing in maximum detail \u2014 past versions failed by overflowing and becoming illegible. Keep it compact and balanced.\n\n== HARD SIZING RULES (to avoid clipping \u2014 the #1 past failure) ==\n- The work-items table must have AT MOST 10 rows (aim for 8\u201310). If the use case has more items, show only the active/at-risk/recently-changed rows and note \"(+N more in backlog)\". Do NOT list every item.\n- Keep each table cell SHORT: detail = one terse phrase (e.g. \"ETA Tue, lag 4s\"), not a sentence.\n- Prefer a 2-column layout over 3 columns (3-column grids overflowed). Left = overall + sub-progress + state counts; right = table on top, then risks, recent log, next-up stacked. Keep blocks small.\n- Cap each side-rail section (risks / recent / next-up) at no more than 5 lines each.\n- Use compact labels and progress bars of fixed small width (e.g. 10 chars).\n\n== EVERY rubric criterion MUST be satisfied \u2014 give concrete content for each ==\n1. CLEAR OVERALL COMPLETION INDICATOR: a hero stat with denominator, e.g. \"33 / 60 done (55%)\" plus a % progress bar. Never a vague status word alone.\n2. SUB-PROGRESS BY PHASE/WORKSTREAM/CATEGORY: 3\u20135 named sub-groups each with its own fraction + %/bar (e.g. \"Wave 3 \u2014 Databases: 8/12 (67%)\"). Not a single number.\n3. STATE COUNTS strip: explicit Done / In-Progress / Blocked / To-Do counts with icons (\u2705 \ud83d\udd04 \u26d4 \u2b1c).\n4. WORK-ITEMS TABLE: columns Item | Owner | Status | Detail. EACH row needs owner + status + a concrete detail (ETA, result, or note) \u2014 never a bare title. 8\u201310 rows max.\n5. BLOCKERS/RISKS: an explicit red callout listing each blocked/at-risk item with owner + cause, or state \"None \u2014 all clear\". Call out critical-path/slip risk.\n6. 
RECENT ACTIVITY LOG (do NOT skip \u2014 one example scored 0.0 here): 4\u20135 TIMESTAMPED past events, newest first, each \"<when> \u2014 <what> \u2192 <new state> (owner)\". Must convey temporal context (last completed, when).\n7. NEXT UP / UPCOMING (consistently the weakest criterion \u2014 make it STRONG): a clear forward queue of 3\u20135 specific upcoming actions, EACH with a date/day AND an owner AND the concrete action (e.g. \"Tue 22:00 \u2014 payments-db cutover (S. Liu)\"). Include escalations/decisions due. Make it unambiguous what happens next and who does it. This is not optional filler.\n8. INFORMATION-DENSE YET LEGIBLE: pack status + context but stay scannable and unclipped. Use boxed/separated sections, color-coded status chips, aligned columns. Density must not come at the cost of fitting.\n\n== STRUCTURE OF YOUR OUTPUT ==\nTitle bar (name, owner/lead, target date or T-minus, last-updated).\nThen numbered sections covering all 8 criteria above with real values.\nEnd with a short LAYOUT note describing the 2-column grid and how blocks are separated for scannability, explicitly stating it is sized to fit without clipping.\n\nBe concrete, be compact, satisfy every criterion, and never overflow.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_task-progress/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_task-progress/report.md new file mode 100644 index 0000000..11859bb --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_task-progress/report.md @@ -0,0 +1,5 @@ +# GEPA journey: task-progress + +rubric criteria: 8; val: 2 + +**Seed 0.5791 → Best 0.6727 (+0.0937)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_task-tracker/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_task-tracker/best_prompts.json new file mode 100644 index 0000000..41e532e --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_task-tracker/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'task-tracker' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- A progress ring shows overall completion percentage\n- Stat cards summarize counts by status (e.g. done / in-progress / todo / blocked)\n- A task table lists tasks with their status, owner, and other columns\n- A blocker alert highlights at least one blocked item\n- Task statuses are color-coded consistently across the ring, cards, and table\n- The layout reads top-down as a report (ring/stats, then table, then alert)\n- The tracker has a clear title naming the sprint/release\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_task-tracker/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_task-tracker/report.md new file mode 100644 index 0000000..8855eff --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_task-tracker/report.md @@ -0,0 +1,5 @@ +# GEPA journey: task-tracker + +rubric criteria: 7; val: 2 + +**Seed 0.6364 → Best 0.6364 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_trace-waterfall/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_trace-waterfall/best_prompts.json new file mode 100644 index 0000000..92c6567 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_trace-waterfall/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'trace-waterfall' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Each span is a horizontal bar from its start ms to end ms\n- Spans are sorted by start time down the y-axis so they cascade\n- Spans are colored by service as a second channel (in addition to y-position)\n- The x-axis domain is fixed from 0 to the total trace duration\n- Parent spans visibly bracket the time range of their children\n- Tooltip shows span, service, start, and duration\n- The chart title/subtitle states the request and total duration\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_trace-waterfall/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_trace-waterfall/report.md new file mode 100644 index 0000000..1a38000 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_trace-waterfall/report.md @@ -0,0 +1,5 @@ +# GEPA journey: trace-waterfall + +rubric criteria: 7; val: 2 + +**Seed 0.6863 → Best 0.6863 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_user-journey/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/journey_user-journey/best_prompts.json new file mode 100644 index 0000000..b4cbde1 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_user-journey/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are generating a renderable 'user-journey' board that gets parsed by a STRICT automated text parser into a node-edge diagram, then scored by a rubric. The parser is literal and fragile: it keys on exact tokens, exact delimiters, and bare lines. Markdown decoration (bold **, code fences ```, tables with |---|, headers with extra symbols, bullet dashes before data rows, emphasis) HIDES data from the parser and scores ZERO. Output data lines as PLAIN bare lines with the exact pipe/arrow delimiters specified \u2014 no leading \"-\", no \"**\", no surrounding code fences around the data rows themselves.\n\nINPUTS\n- Use case: {topic}\n- Reader: {audience}\n- Goal: {purpose}\n\n================ CRITICAL PARSER RULES (these caused past 0.0 scores) ================\n1. Do NOT wrap node lines, edge lines, or the legend in markdown code fences (```). Do NOT prefix data rows with \"- \" or \"* \". Do NOT bold any data. Each data line must start with its id or its key token.\n2. 
Use these EXACT delimiters with single spaces around them:\n - Node line: id | tone | glyph | label | subline | x,y\n - Edge line: from -> to | color | style | label\n - The arrow is exactly \"->\" (hyphen + greater-than). Coordinates are \"x,y\" (e.g. 300,60) NOT \"(x, y)\".\n3. tone/color tokens must be EXACTLY one bare lowercase word: blue, amber, green, or red. No other formatting.\n4. The LEGEND must be four bare lines, each exactly \"color = State\", NOT in a table, NOT bulleted, NOT fenced.\n\n================ GEOMETRY (past runs reported 1\u20133 off-canvas) ================\nPast boards put nodes too low and too far right and still got flagged off-canvas because the parser adds node width/height to the coordinate. Use SAFE margins:\n- Canvas: width 800, height 1200.\n- Direction: TB (top-to-bottom), y increases downward.\n- Main spine x = 300. Failure/branch nodes x = 540 (NOT 560 \u2014 node width ~200 means 560+200=760 risks the boundary; keep x <= 540 so x+width stays well under 800).\n- Vertical spacing 130px starting at y = 60. Keep the LAST node's y <= 1000 (NOT 1140 \u2014 leave room for node height ~80 so y+height stays under 1140).\n- Use AT MOST 7 main-spine nodes so the bottom stays safe: with 7 nodes, last y = 60 + 6*130 = 840. With 6 nodes, last y = 840 max. Prefer 6\u20137 main nodes.\n- NEVER let any x exceed 540 or any y exceed 1000.\n\n================ NODE REQUIREMENTS ================\n- ids: n1, n2, n3, ... 
in order.\n- Required tone ordering DOWN THE PAGE (this IS \"tone progression\"):\n * EXACTLY ONE blue node, FIRST, the entry (n1).\n * Then a contiguous run of amber nodes (the active steps).\n * Then EXACTLY ONE green node, LAST on the main spine, at the bottom (success outcome).\n * Never interleave: no amber after green, no green above amber, only one blue and one green.\n- At least ONE red node = the failure/error branch, placed offset-right at x=540, at the SAME y as the decision node it branches from.\n- glyph: a single relevant emoji (\ud83d\udd11 sign-in, \u2709\ufe0f email, \ud83d\udcb3 payment, \ud83d\uded2 cart, \ud83d\udce6 shipping, \u26a0\ufe0f error, \ud83d\udd04 retry, \u2705 success, etc.). Never omit it.\n- subline: one concrete technical detail \u2014 real endpoint, screen path, status code, or metric (e.g. \"POST /auth/forgot \u00b7 token TTL 60min\", \"/checkout/pay \u00b7 Stripe 402\"). No placeholders.\n\n================ EDGE REQUIREMENTS ================\n- color MUST EQUAL the tone of the TARGET (to-) node. Edge into amber = amber; into green = green; into red = red; into blue = blue. Verify EVERY edge against its destination node's tone.\n- style = solid normally. The edge pointing INTO the red failure node MUST be \"dashed\" AND its color MUST be \"red\" (red node reached by a dashed red edge \u2014 both required).\n- A decision/step node has TWO outgoing edges: one solid edge to the next success-path node, and one dashed red edge to the red failure node.\n- Optional recovery edge: red node -> an earlier amber node, color amber, style dashed.\n- label: short transition word (e.g. valid, declined, retry, verified).\n\n================ OUTPUT ORDER (data first, prose last and minimal) ================\nPrint these sections in order. Use a plain text section header line (e.g. 
\"NODES\") with NO markdown symbols on the data rows beneath it.\n\nCANVAS\nwidth 800\nheight 1200\ndirection TB\nspine x 300\nbranch x 540\n\nNODES\n(one bare node line per node, format: id | tone | glyph | label | subline | x,y)\n\nEDGES\n(one bare edge line per edge, format: from -> to | color | style | label)\n\nLEGEND\nblue = Entry\namber = Step (active)\ngreen = Success\nred = Failure\n\nSELF-CHECK\ntone progression: blue entry = <id>; amber steps in order = <ids>; green success = <id> (confirm it is the last/bottom node).\nfailure branch: red node = <id>; dashed red edge into it = <from -> to> (confirm color=red, style=dashed).\nlegend present: confirm all four bare pairs printed above.\nedges match target: spot-check the success edge and the failure edge \u2014 state each edge's color equals its target node's tone.\ngeometry: confirm every x <= 540 and every y <= 1000.\n\nCALLOUTS\n(2\u20134 brief lines, tailored to the reader and goal: drop-off % for a PM, friction points for a CRO specialist, ticket causes for a support lead. Use concrete numbers. Keep AFTER all structured sections.)\n\n================ CONTENT ================\nMake every value SPECIFIC to the given use case: real screen names, endpoints, status codes, and relevant percentages. Generic/placeholder content fails. The structured NODES/EDGES/LEGEND/SELF-CHECK come FIRST because they are what gets rendered and scored; keep prose to the CALLOUTS section only.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/journey_user-journey/report.md b/scripts/experiments/gepa-flowchart/overnight/journey_user-journey/report.md new file mode 100644 index 0000000..9b22b90 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/journey_user-journey/report.md @@ -0,0 +1,5 @@ +# GEPA journey: user-journey + +rubric criteria: 7; val: 2 + +**Seed 0.3634 → Best 0.5671 (+0.2037)** diff --git a/scripts/experiments/gepa-flowchart/overnight/linear_instance/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/linear_instance/best_prompts.json new file mode 100644 index 0000000..1551bb2 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/linear_instance/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are planning a flowchart that will be fed to a downstream diagram generator. The generator renders your plan LITERALLY and naively. Empirically it has these failure modes, which you MUST design around:\n- It CLIPS heavily: plans of 10\u201312 nodes routinely rendered with 8\u201310 nodes OFF the visible canvas. The effective safe budget is far below 12.\n- It draws spine edges STRAIGHT THROUGH any node placed beside the spine. EVERY past plan that placed a decision's off-ramp \"directly beside\" the decision produced \"edge-over-node\" errors, because the edge from the decision to the NEXT spine node visually crosses the side off-ramp. Side-branches DO NOT WORK. Treat this as a hard law.\n- It only renders the first pane reliably and never renders cross-pane edges.\n\nVisual rendering is the #1 cause of score loss. Geometry rules are hard constraints. But comprehension is the second cause of loss \u2014 you must ALSO pack dense specifics into labels. 
Both matter.\n\nInputs you will receive:\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nYour job: produce a concise plain-text plan describing what the flowchart must show so the stated audience can FULLY understand the subject, while rendering cleanly and completely on ONE screen.\n\n=====================================================================\n== HARD GEOMETRY CONSTRAINTS (these dominate the score) ==\n=====================================================================\n\nG1. NODE BUDGET \u2264 9 NODES, TARGET 7\u20138. Count EVERYTHING: trigger, steps, decisions, failure/off-ramp nodes, and end states. Past boards at 10\u201312 planned nodes lost 8\u201310 nodes off-canvas. Do NOT plan to the old 12 ceiling. If the subject won't fit in 9, RUTHLESSLY CUT to the single most decision-critical slice. Fewer, denser nodes beat more, clipped ones.\n\nG2. STRICTLY LINEAR TOP-DOWN SPINE \u2014 NO SIDE BRANCHES, NO PARALLEL OFF-RAMPS. This is the most important rule and the one every past plan violated. The generator runs the spine edge through any node sitting beside the spine. Therefore:\n - Build ONE straight vertical chain: node1 \u2192 node2 \u2192 \u2026 \u2192 nodeN.\n - A decision node must have its branches go to the IMMEDIATE NEXT node down (pass/continue) and have the failure/off-ramp be the VERY NEXT node in the chain \u2014 never a node placed to the side.\n - To make a failure terminal, place it as the next node DOWN in the linear chain, then continue the main flow only if it is the success path. 
Do NOT fan multiple off-ramps off one decision into parallel side terminals.\n - Because budget is tight, COLLAPSE failure handling: instead of one terminal per branch, use AT MOST ONE OR TWO terminal nodes total at the bottom of the chain, and encode the distinct failure responses inside the labels of the decision nodes themselves.\n - Explicitly state in the Layout Plan: \"Single straight vertical chain; every edge connects consecutive nodes only; NO side-branches, NO parallel off-ramps; no edge skips or crosses any node.\"\n\nG3. ZERO LONG-RANGE EDGES. Every edge connects consecutive entries in your linear node list. Never route back to an earlier node or forward past an intervening node. No loops drawn as edges (describe retry semantics inside a label instead of drawing a back-edge).\n\nG4. PREFER ONE SINGLE SELF-CONTAINED FLOW. Do not split into panes unless absolutely unavoidable. If you must, each pane is a COMPLETE independent linear flow of \u2264 7 nodes with its own trigger and terminal, and you must write \"render each pane as a separate diagram\"; assume NO cross-pane edges render.\n\nG5. COMPACT & BALANCED. Keep the chain short (7\u20138 nodes). Short labels only.\n\nG6. SHORT LABELS, BUT DENSE WITH CONCRETE VALUE + ACTOR. \u2264 ~10 words per node is acceptable if needed to fit specifics; the generator shows ONLY the label text, never your prose. The concrete number(s) AND the actor MUST be inside the label. e.g. \"RN: K\u207a>6.0 \u2192 page MD \u226415min (PagerDuty)\".\n\n=====================================================================\n== CONTENT REQUIREMENTS (must survive INTO the short node labels) ==\n=====================================================================\nReaders repeatedly failed to answer questions because specifics lived only in prose, not in rendered nodes, OR because the relevant node was clipped off-canvas. 
Since the budget is now small, you must PACK multiple specifics per label rather than spreading them across many nodes.\n\nC1. CONCRETE VALUES, NOT CATEGORIES \u2014 inside the label. Every threshold, window, count, timeout, retry, quantity gets a realistic specific number.\n - When the domain has a TABLE of values (e.g. P1\u2013P4 priorities with SLAs, VAT rates by region, category\u2192queue routing map, multi-parameter screening criteria), put the WHOLE small table compactly INTO ONE node's label rather than omitting it or splitting it across nodes. e.g. \"L1 set prio: P1 biz-down 1h / P2 2h / P3 8h / P4 24h\". Omitting the table was a repeated comprehension failure.\n - Give EACH parameter's value (e.g. \"temp>38\u00b0C, HR>90, RR>20, WBC>12k\"), not \"\u22652 criteria\".\n\nC2. ACTOR / DECISION-MAKER in EVERY node \u2014 especially WHO evaluates each decision and WHO performs each lookup. Name the specific role, named coordinator, automated tool, or external service. Mark automated vs human-in-the-loop. Past readers specifically flagged \"which actor performs the rate/lookup step\" \u2014 name the doer for every distinct sub-action, including lookups, not just classification.\n\nC3. TRIGGER + INITIATING ACTOR in the first node: the precise entry event AND the system/person that fires it.\n\nC4. INPUT RESOLUTION & PRECEDENCE where the flow consumes ambiguous input. If the flow keys off an attribute that can come from multiple sources (e.g. shipping vs billing vs IP geolocation; B2B vs B2C; product category), state in a label HOW it is resolved and which source WINS on conflict. This was a repeated \"not shown\" comprehension gap.\n\nC5. ORDERING & DEPENDENCIES explicit, including the BASIS of computation (e.g. \"tax on FX-converted amount, not base\"; \"persist-then-act\"; which step is BLOCKING). Readers asked whether tax is on converted vs base amount, and which step blocks \u2014 answer these inside labels.\n\nC6. 
FAILURE / ERROR PATHS WITH SPECIFICS \u2014 but encoded compactly. Because side-branches are banned and budget is tight, fold failure semantics INTO the decision label and a single shared terminal. For each handled failure specify, in the label: the SPECIFIC trigger (error code / exception / timeout value), the response (block / retry / rollback / escalate / serve-last-good), the retry count + backoff, the timeout before escalation, and WHO is notified BY WHAT METHOD (\"PagerDuty on-call\", \"page MD\", \"Slack #alerts\"), plus any DLQ / manual-reconciliation destination.\n - Cover the failure of an ACTION ITSELF, not just \"didn't resolve\": provisioning error, quota exceeded, unreachable dependency, build/publish failure, stale-token rejection, analyzer/QC failure, partial/split writes (e.g. \"online write OK but offline write fails\"), bounce-back/misroute from a downstream queue. PARTIAL-failure and bounce-back paths were repeatedly flagged as \"not shown\".\n - State the DISPOSITION explicitly: on failure, is the previous version served, is the request errored, is stale data flagged? Don't leave it ambiguous.\n\nC7. RETRYABLE VS NON-RETRYABLE \u2014 distinguish in the label (retryable = count/backoff; terminal = halt/escalate).\n\nC8. STATE PERSISTENCE / RECOVERY where relevant: how state is saved and how a crashed coordinator resumes/compensates (\"replay saga log, resume last step, idempotency key prevents double-charge\") \u2014 packed into a step label.\n\nC9. ANTI-FLAPPING / DEFAULT-TO-SAFE where relevant: cooldowns, \"when outcome unknown after timeout \u2192 assume FAILED / halt / degrade\", fallback-to-default/null on cache miss. The cache-miss / default-fallback condition was a repeated gap.\n\nC10. NEWCOMER CONTEXT: inline glossary defining domain terms so the audience reads the diagram standalone.\n\nGiven the small budget, FIT these by PACKING specifics into few labels, choosing the single most decision-critical slice. 
Priority order if forced to cut: trigger+actor \u2192 the 2\u20133 key decisions with full concrete thresholds and named evaluators \u2192 the dominant failure path with notify target + disposition \u2192 input resolution/precedence and computation basis.\n\n=====================================================================\n== OUTPUT FORMAT (plain text, concise) ==\n=====================================================================\n- Purpose & Scope (1\u20132 lines)\n- Newcomer Context / Legend (define terms inline)\n- Trigger & Initiating Actor (exact event + who/what fires it)\n- Key Steps/States (numbered linear chain; each = the SHORT NODE LABEL with actor + concrete value(s); pack tables/specifics into the relevant label)\n- Decisions & Branch Triggers (each: who evaluates, exact condition with concrete value(s), where the pass branch goes (next node), and the failure response encoded in-label routing to the shared terminal)\n- Failure/Error Paths (each: specific trigger/code/timeout, response, retry/backoff, notify target+method, disposition (prev-version/errored/flagged), terminal/DLQ \u2014 folded into labels, minimal extra nodes)\n- End States (1\u20132 shared terminals at the bottom of the chain)\n- Layout Plan: total node count (\u22649, target 7\u20138, with explicit tally), top-down direction, single-flow (preferred) vs pane-split (each \u22647, self-contained, no cross-pane edges), and the explicit statement: \"Single straight vertical chain; every edge connects consecutive nodes only; NO side-branches, NO parallel off-ramps; no edge skips or crosses any node.\"\n\nSelf-check before finishing:\n(a) Is the total \u22649 (ideally 7\u20138) with a written tally?\n(b) Is the plan a SINGLE straight chain with NO side-placed off-ramps and NO parallel terminals (the recurring fatal geometry error)?\n(c) Does every edge connect only consecutive nodes?\n(d) Does every concrete value, table, actor, input-precedence rule, computation basis, and failure 
disposition live INSIDE a node label, not just prose?\nIf any answer is no, cut scope, collapse failures into labels, and revise.", + "generate": "You generate a termchart `flow` diagram as JSON.\n\n{skill_context}\n\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single flow JSON object. Use clear, specific labels; label edges with the condition/trigger; group related nodes. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/linear_instance/report.md b/scripts/experiments/gepa-flowchart/overnight/linear_instance/report.md new file mode 100644 index 0000000..6e09848 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/linear_instance/report.md @@ -0,0 +1,14 @@ +# GEPA flowchart optimization — linear_instance (RECOVERED from gepa_state) + +- iterations: 6 total evals: 122 candidates: 5 + +| idx | val_agg | comprehension | geometry | visual_quality | parent | +|---|---|---|---|---|---| +| 0 (seed) | 0.4385 | 0.467 | 0.330 | 0.531 | [None] | +| 1 | 0.5365 | 0.605 | 0.349 | 0.647 | [0] | +| 2 | 0.5687 | 0.663 | 0.389 | 0.603 | [1] | +| 3 **BEST** | 0.6001 | 0.725 | 0.479 | 0.469 | [2] | +| 4 | 0.5573 | 0.676 | 0.358 | 0.559 | [2] | + +**Seed 0.4385 → Best (idx 3) 0.6001 (+0.1616)** + diff --git a/scripts/experiments/gepa-flowchart/overnight/ml_journey_ablation-study/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/ml_journey_ablation-study/best_prompts.json new file mode 100644 index 0000000..5942926 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ml_journey_ablation-study/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are planning an 'ablation-study' board that will be rendered and judged against a strict rubric. Each rubric criterion is checked against the ACTUAL rendered board, so every criterion must map to one explicit, simple, clearly-labeled visual element. 
Describing an idea is not enough \u2014 name the literal text/value that will appear on the board.\n\nINPUT FORMAT\n- Use case: {topic} (the model/system and which components are ablated, plus the eval metric)\n- Reader: {audience}\n- Goal: {purpose}\n\nOUTPUT\nProduce a concise plain-text plan with concrete, real content (specific labels, numeric values, ordering). Avoid markdown tables and avoid decorative extras (no ghost segments, drop brackets, intensity-scaled colors, shaded gap regions, multi-line captions). Keep it minimal so nothing overflows or clips. Use a simple, fixed structure that directly satisfies each rubric criterion below.\n\nREQUIRED STRUCTURE (fill in with real values for the given use case):\n\n1. TITLE \u2014 One line that explicitly names BOTH the model/system AND the metric being ablated.\n Example form: \"<Model name>: <Metric name> Ablation of <what is ablated>\"\n The metric name in the title must be the SAME metric word used on the Y axis.\n\n2. Y AXIS \u2014 State the axis TITLE as the full metric name with units, e.g. \"Validation Accuracy (%)\".\n - The Y-axis title text must literally contain the metric name (e.g. \"Accuracy\", \"Success Rate\", \"EM\").\n - Start the axis at 0 unless deltas are tiny; if zoomed, keep range modest and label it. Never let bars or labels clip the top \u2014 leave headroom above the tallest bar for value labels.\n\n3. BARS \u2014 One vertical bar per ablated component/variant (each bar = \"remove ONE component\").\n - PLUS one bar for the full/baseline model. Make the baseline a clearly labeled bar (e.g. label \"Full model\").\n - List every bar with its exact label and exact metric value.\n - Bar labels should be short (e.g. \"no Mixup\", \"no Reranker\") so text does not overflow.\n\n4. 
BASELINE REFERENCE \u2014 Show the full/baseline model in TWO ways for robustness:\n (a) as a distinct-colored bar, AND\n (b) as a horizontal reference rule/line drawn across the chart at the baseline value, labeled with the baseline value (e.g. \"Full = 87.4%\").\n Both must be present and explicitly named.\n\n5. DELTA / DROP \u2014 Every ablation bar must have its drop from baseline shown as an explicit text annotation (e.g. \"\u22124.3\") in addition to the bar's own value. State the delta value for each bar. The delta = baseline value \u2212 variant value.\n\n6. ORDERING \u2014 Order the ablation bars by impact on the metric (largest drop first). State the resulting left-to-right order explicitly. The baseline bar sits at one end (leftmost) and does not break the impact ordering of the rest.\n\n7. VALUE LABELS \u2014 Every bar (including baseline) must display its metric value as a text label on/above the bar. State each value label literally.\n\nCONTENT GUIDANCE\n- Invent realistic, internally-consistent numbers if not given: baseline should be the highest value; each ablation should be lower; deltas must arithmetically match (baseline \u2212 variant).\n- Make labels and the metric name consistent everywhere (title, axis, bars).\n- Common ablation domains to ground content:\n * Data augmentations (val accuracy %): e.g. RandAugment, Mixup, CutMix, Random Erasing, Color Jitter, Horizontal Flip \u2014 name the model+dataset (e.g. ResNet-50 / ImageNet-100).\n * RAG components (answer accuracy / EM %): retrieval, reranker, query-rewrite \u2014 name the dataset (e.g. 
NaturalQuestions-Open).\n * Agent scaffolding (task success rate %): chain-of-thought, few-shot, tools \u2014 name the agent/model.\n\nCHECKLIST (the plan must satisfy ALL):\n[ ] Title names the specific model AND the metric.\n[ ] Y-axis title is the metric name with units.\n[ ] One bar per ablated component, each with a real value.\n[ ] Baseline shown as BOTH a labeled bar AND a labeled horizontal rule.\n[ ] Each ablation bar has an explicit delta annotation.\n[ ] Bars ordered by impact (largest drop first), order stated.\n[ ] Every bar has a value label.\n[ ] Layout is simple with headroom \u2014 no clipping/overflow, no decorative clutter.\n\nEnd the plan with a single short sentence confirming each criterion is met.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/ml_journey_ablation-study/report.md b/scripts/experiments/gepa-flowchart/overnight/ml_journey_ablation-study/report.md new file mode 100644 index 0000000..4342ee3 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ml_journey_ablation-study/report.md @@ -0,0 +1,5 @@ +# GEPA journey: ablation-study + +rubric criteria: 7; val: 2 + +**Seed 0.2641 → Best 0.4636 (+0.1995)** diff --git a/scripts/experiments/gepa-flowchart/overnight/ml_journey_confusion-matrix/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/ml_journey_confusion-matrix/best_prompts.json new file mode 100644 index 0000000..da5f98c --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ml_journey_confusion-matrix/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'confusion-matrix' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- An N\u00d7N grid of actual (rows) vs predicted (columns) classes\n- Each cell shows its count or rate as a text label\n- Cells are color-encoded by value with a sequential scale\n- Both axes are labeled (actual vs predicted) with the class names\n- The correct-prediction diagonal is distinguishable from off-diagonal errors\n- A color legend/scale is present\n- Title names the classifier and dataset\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/ml_journey_confusion-matrix/report.md b/scripts/experiments/gepa-flowchart/overnight/ml_journey_confusion-matrix/report.md new file mode 100644 index 0000000..e53becb --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ml_journey_confusion-matrix/report.md @@ -0,0 +1,5 @@ +# GEPA journey: confusion-matrix + +rubric criteria: 7; val: 2 + +**Seed 0.2682 → Best 0.2682 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/ml_journey_eval-scorecard/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/ml_journey_eval-scorecard/best_prompts.json new file mode 100644 index 0000000..ea9692e --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ml_journey_eval-scorecard/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan an 'eval-scorecard' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are planning a visual scorecard board that evaluates a system/model against criteria. Output a CONCISE plain-text plan naming SPECIFIC content (real values, labels, structure). Generic plans fail.\n\n## RUBRIC \u2014 your plan MUST explicitly satisfy EVERY item:\n1. Each evaluation criterion is its own row.\n2. Each criterion shows a pass/partial/fail verdict, color-coded.\n3. Each criterion shows a numeric score or weight.\n4. An overall aggregate score is prominently shown.\n5. Failing criteria are visually highlighted.\n6. Each criterion has a short note or evidence string.\n7. Title names the SPECIFIC system/model under evaluation (a real-sounding name + version).\n\n## CRITICAL LESSONS (these drive the score):\n- COMPREHENSION is the hardest metric and the easiest to lose. Keep the plan SIMPLE and SCANNABLE. Do NOT over-decorate or add many extra sections, footers, mini-bars, callout boxes, and sub-captions that bury the core table. 
A reader must instantly grasp: what's being evaluated, each criterion's verdict+score+note, and the overall result. Favor a single clean table plus a clearly-placed aggregate.\n- GEOMETRY: content must NOT overflow or clip. Use at most 4\u20135 criterion rows. Keep note/evidence strings SHORT (one brief phrase, ~6\u201312 words). Keep cell text terse. Avoid wide tables with long strings that would wrap or spill. Prefer compact columns.\n- Do not stuff the board. Every added element risks lowering comprehension. Include exactly what the rubric needs and nothing decorative beyond simple color/highlight cues.\n\n## REQUIRED STRUCTURE (keep it to these parts, in this order):\n1. **Title** (top, prominent): names specific system/model + version, e.g. \"Safety Eval Scorecard \u2014 ChatGuard-7B v2.3\". One short subtitle line max (date/run id).\n2. **Aggregate score** (prominent, e.g. top-right banner/badge, large font): a single number + overall verdict + color, e.g. \"OVERALL: 78/100 \u2014 PARTIAL (amber)\". Optionally one short status line (e.g. \"Threshold \u226585 \u2192 BLOCKED\").\n3. **Scorecard table**: ONE ROW PER CRITERION. Columns: Criterion | Weight/Score | Verdict | Note/Evidence. Each row has a real criterion name, a numeric score and/or weight %, a color-coded verdict (\ud83d\udfe2 PASS / \ud83d\udfe1 PARTIAL / \ud83d\udd34 FAIL), and a SHORT evidence phrase with a concrete detail (a count, a percentage, a specific failure).\n4. **Color legend + failure highlight rule** (brief): state the verdict color bands and that failing rows get a distinct visual treatment (red border/tint + \u26a0 icon).\n\n## CONTENT GUIDANCE:\n- Derive 4 (max 5) concrete criteria directly from the use case.\n- Give each a plausible numeric score consistent with its verdict (PASS = green/high, PARTIAL = amber/mid, FAIL = red/low). 
Use score bands like PASS \u226585, PARTIAL 70\u201384, FAIL <70 (state your bands).\n- Make at least one criterion FAIL so the highlight rule is meaningful.\n- Ensure the aggregate is consistent with the rows (a weighted combination), but show the math at most once and briefly \u2014 do not let it dominate.\n- Evidence strings must be specific to the domain (real metric names, sample counts, example failure modes) but kept SHORT.\n\nOutput the plan as compact labeled sections. Be concrete, brief, and uncluttered.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/ml_journey_eval-scorecard/report.md b/scripts/experiments/gepa-flowchart/overnight/ml_journey_eval-scorecard/report.md new file mode 100644 index 0000000..ba2a223 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ml_journey_eval-scorecard/report.md @@ -0,0 +1,5 @@ +# GEPA journey: eval-scorecard + +rubric criteria: 7; val: 2 + +**Seed 0.5989 → Best 0.6091 (+0.0102)** diff --git a/scripts/experiments/gepa-flowchart/overnight/ml_journey_ml-pipeline/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/ml_journey_ml-pipeline/best_prompts.json new file mode 100644 index 0000000..9275d2c --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ml_journey_ml-pipeline/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'ml-pipeline' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Each pipeline stage is a clearly labeled node (ingest, features, train, eval, deploy)\n- Stages are connected in execution order with directed 
arrows\n- An eval/validation gate before deployment is explicitly shown\n- Data/model artifacts passed between stages are labeled\n- A branch on eval pass/fail (deploy vs retrain) is shown\n- The pipeline is laid out top-to-bottom\n- No edge passes through an unrelated stage node\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/ml_journey_ml-pipeline/report.md b/scripts/experiments/gepa-flowchart/overnight/ml_journey_ml-pipeline/report.md new file mode 100644 index 0000000..a51f8d2 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ml_journey_ml-pipeline/report.md @@ -0,0 +1,5 @@ +# GEPA journey: ml-pipeline + +rubric criteria: 7; val: 2 + +**Seed 0.4231 → Best 0.4231 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/ml_journey_model-leaderboard/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/ml_journey_model-leaderboard/best_prompts.json new file mode 100644 index 0000000..33a7224 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ml_journey_model-leaderboard/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'model-leaderboard' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Each model is its own row in a comparison table\n- Multiple eval metrics are shown as columns\n- The best value per metric (or the winning model) is flagged with a badge\n- Model size/params or cost/latency is 
shown for each model\n- Models are ranked/sorted by a primary metric\n- An overall winner or aggregate score is indicated\n- Title states the benchmark/dataset the models were evaluated on\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/ml_journey_model-leaderboard/report.md b/scripts/experiments/gepa-flowchart/overnight/ml_journey_model-leaderboard/report.md new file mode 100644 index 0000000..4bd2366 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ml_journey_model-leaderboard/report.md @@ -0,0 +1,5 @@ +# GEPA journey: model-leaderboard + +rubric criteria: 7; val: 2 + +**Seed 0.8575 → Best 0.8575 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/ml_journey_training-curves/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/ml_journey_training-curves/best_prompts.json new file mode 100644 index 0000000..36e79b7 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ml_journey_training-curves/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'training-curves' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Separate train and validation curves are both plotted\n- X axis is the training step/epoch with a clear axis title\n- Y axis is the loss/metric with a clear axis title\n- The train/val divergence (overfitting gap) is visible and legible\n- The best checkpoint / early-stop point is marked 
(rule or point)\n- A legend distinguishes the train vs validation series\n- Chart has a descriptive title and subtitle naming the run\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/ml_journey_training-curves/report.md b/scripts/experiments/gepa-flowchart/overnight/ml_journey_training-curves/report.md new file mode 100644 index 0000000..9449aaf --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ml_journey_training-curves/report.md @@ -0,0 +1,5 @@ +# GEPA journey: training-curves + +rubric criteria: 7; val: 2 + +**Seed 0.8385 → Best 0.8385 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/repair_floor/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/repair_floor/best_prompts.json new file mode 100644 index 0000000..bf3cfea --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/repair_floor/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are planning a flowchart that will be fed to a downstream diagram generator (a React Flow-style auto-layout engine). 
Your plan's quality is judged on TWO things, BOTH of which must score well:\n(1) COMPREHENSION \u2014 a newcomer can answer specific factual questions from the rendered diagram, and\n(2) VISUAL RENDERING \u2014 the diagram fits on one screen with no clipped/off-canvas nodes and no edges that cross unrelated nodes.\n\nCRITICAL LESSON FROM PAST FAILURES: Plans with perfect content still scored ~0.4 overall because the rendered board pushed 18\u201326 nodes off-canvas, clipped everything after the first pane, and produced edges that ran across unrelated nodes. The content was right but UNREADABLE. You must treat rendering geometry as a first-class constraint, not an afterthought. A smaller plan that fully renders beats a complete plan that clips.\n\nInputs:\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nProduce a concise plain-text plan describing what the flowchart must show. Follow ALL rules below.\n\n============================================================\n=== HARD RENDERING CONSTRAINTS (these caused past failures) ===\n============================================================\n\n1. TOTAL NODE BUDGET ACROSS THE WHOLE PLAN: 12\u201318 nodes MAXIMUM, summed over ALL panes. The generator lays out every node you describe; past plans assumed each pane rendered separately and blew the budget (3 panes \u00d7 8\u20139 nodes = 24+ nodes \u2192 26 off-canvas). Count EVERY box you mention \u2014 including every decision, every terminal state, and every intermediate step \u2014 and keep the grand total at or below 18.\n\n2. PER-PANE BUDGET: each pane is 4\u20137 nodes. If you genuinely cannot fit the topic in 18 total nodes, KEEP ONLY the 2\u20133 panes that best serve the stated Purpose and explicitly drop the rest into \"Out of Scope.\" Do NOT silently include a fourth pane that will be clipped. It is better to fully cover 2 panes than to truncate 3.\n\n3. PREFER FEWER PANES. Two well-rendered panes outscore three clipped ones. 
Only use 3 panes if the total still fits in 18 nodes (i.e., ~6 nodes each). Never plan 4 panes.\n\n4. NO EDGE-OVER-NODE GEOMETRY. The single most common error was a decision node branching to its \"happy path\" successor while a failure/side node sat physically between them (e.g., `D1 \u2192 main_step` rendered ON TOP OF \"side_node\"). To prevent this:\n - For any decision with a \"main/continue\" branch and a \"side/failure\" branch, route the MAIN branch to the IMMEDIATELY-next node, and send the SIDE branch to a TERMINAL node placed off to the side (a leaf with no further outgoing edges). Failure/retry/escalation branches should END in a clearly-labeled terminal state, NOT route back into the main column past other nodes.\n - Do NOT create edges that skip over an intermediate node to reach a later one. Every edge should connect adjacent nodes in reading order or go to a leaf terminal.\n - Do NOT create edges that jump from one pane into the middle of another pane. Panes connect only via a single clean hand-off from the last node of one pane to the first node of the next (or, better, treat panes as fully independent \u2014 see #5).\n\n5. TREAT PANES AS INDEPENDENT, SELF-CONTAINED DIAGRAMS. Each pane should have its own Entry/Trigger and its own Terminal States. Inter-pane references should be a label (\"\u2192 continues to Pane B\"), NOT a drawn edge spanning panes. This keeps each pane laying out cleanly.\n\n6. NO LONG-RANGE OR BACK-JUMPING EDGES. Loop-backs/retries should connect to an adjacent node or resolve into a terminal \"retry exhausted \u2192 escalate\" leaf \u2014 never an arrow spanning the whole chart.\n\n7. KEEP TERMINAL STATES FEW (2\u20134 per pane) and place each as a leaf.\n\n8. Suggest a simple layout direction (usually top-to-bottom) OR let the engine choose. 
Do not over-specify layout.\n\n============================================================\n=== CONTENT: BE CONCRETE, NOT GENERIC ===\n============================================================\nWithin the tight node budget, every node you DO include must carry the specific detail a reader will ask about. When choosing what to keep, prioritize the nodes/details that answer the Purpose's implied factual questions (the reader will be quizzed on triggers, ordering, actors, mappings, and failure paths). For each step/decision/trigger, NAME the conventional value rather than describing it abstractly. Always specify:\n\n- TRIGGERS WITH VALUES: concrete thresholds, durations, counts, conditions (e.g., \"CPU > 80% sustained 5 min\", \"3 missed heartbeats / 15s timeout\", \"majority = N/2+1 nodes\"). State a representative real-world default; never say \"a threshold\" or \"a grace window.\"\n- EVALUATING ACTOR: for each decision, name WHO/WHAT evaluates it (the node, a leader/coordinator, a quorum service, CI, on-call engineer, incident commander). State explicitly automated vs. human-in-the-loop approval.\n- STATE/DATA DETAILS: who creates/increments/persists/validates it, where it's stored, at which step it's checked or causes rejection.\n- FAILURE PATHS: for every fallible action or external dependency, specify behavior \u2014 block, retry (how many / what backoff), escalate (to whom / after what timeout), or fail-safe/rollback. Each failure branch must terminate in an explicit leaf node; never leave it implied. (But remember: each such terminal counts against the node budget \u2014 keep them tight.)\n- ORDERING: state the explicit sequence of major operations, AND mark which steps are automated vs. manual where the audience would ask.\n- MAPPING TABLES / LEGENDS: when the topic has a fixed mapping (commit prefix \u2192 version bump, severity tiers, category \u2192 queue, impact\u00d7urgency \u2192 priority), list the FULL mapping inline IN THE LEGEND. 
Mapping tables live in the text legend, not as separate nodes, so they don't consume the node budget but are still answerable.\n- NEWCOMER CONTEXT: a short legend defining domain terms the audience-newcomer needs, with concrete values.\n\nPush exhaustive mappings, rate tables, SLA tables, and definitions into the LEGEND text (section 2). Reserve actual flowchart NODES for the flow logic only. This is how you stay within the node budget while keeping comprehension high.\n\n============================================================\n=== OUTPUT FORMAT (plain text, labeled sections) ===\n============================================================\n1. Purpose & Scope (1\u20132 sentences).\n2. Context for Newcomers \u2014 legend: terms + ALL fixed mappings/tables with concrete values (this carries comprehension detail without using nodes).\n3. Panes \u2014 name each pane, what it covers, and its node count. Then state the GRAND TOTAL node count and confirm it is \u2264 18.\n4. For each pane: \n - Entry/Trigger (concrete condition),\n - Key Steps/States (ordered, concrete details + responsible actor, automated vs manual),\n - Decisions & Branch Triggers (each: specific threshold/condition; evaluating actor; every branch's destination; main branch \u2192 next node, failure/timeout/retry/escalation branch \u2192 a leaf terminal),\n - Terminal States (leaves, 2\u20134).\n - Explicitly confirm no edge skips over an intermediate node and no edge crosses into another pane.\n5. Out of Scope \u2014 state explicitly what is excluded, INCLUDING any panes/details you dropped to stay within the node budget.\n\nBefore finalizing: re-count every node across all panes. If the total exceeds 18, cut content (drop a pane to Out of Scope or merge steps) until it fits. Verify every failure branch ends in a leaf and no edge runs past an intermediate node. 
Prioritize concrete values, named actors, and explicit failure handling over volume \u2014 but never at the cost of fitting on one screen.", + "generate": "You generate a flowchart `flow` diagram as JSON that renders cleanly in a React Flow canvas and is readable as a SINGLE screenshot without panning or zooming.\n\n{skill_context}\n\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single flow JSON object. Output ONLY the JSON (no prose, no code fences).\n\n=================================================================\nTOP PRIORITY: THE IMAGE MUST FIT ON SCREEN\nThis is the #1 failure mode and it has happened in EVERY past attempt: 6\u20138 nodes\nrendered OFF the visible canvas even when the logic and labels were good. Off-canvas\nnodes destroy the visual score regardless of how correct the content is. The auto-layout\nengine spreads nodes out a LOT, so a graph that looks reasonable on paper still overflows.\nYou must therefore build a SMALLER, NARROWER, SHALLOWER graph than feels necessary.\n\nRoot causes you must prevent:\n - Too many nodes total (the layout engine spaces them far apart \u2192 off-canvas).\n - Panes stacked or placed so the second pane falls below/right of the viewport.\n - Decisions fanning out to side-leaves (widens the graph \u2192 horizontal clipping).\n - Long chains (a tall single column runs off the bottom).\n - Long labels forcing wide/tall boxes that push neighbors out of view.\n\nTreat COMPACTNESS as more important than completeness of detail. 
When in doubt, cut a node.\n\n=================================================================\nOUTPUT SCHEMA\n{\n \"direction\": \"TB\" | \"LR\",\n \"nodes\": [\n { \"id\": \"<unique-id>\", \"data\": { \"label\": \"<string>\", \"status\": \"<status>\" }, \"group\": \"<groupId>\" }\n ],\n \"edges\": [\n { \"source\": \"<id>\", \"target\": \"<id>\", \"data\": { \"label\": \"<condition/trigger>\" } }\n ],\n \"groups\": [\n { \"id\": \"<groupId>\", \"label\": \"<pane title>\", \"color\": \"<hex>\" }\n ]\n}\n- `status` values: \"info\" (entry/trigger), \"active\" (process/action), \"neutral\" (decision/evaluation or neutral terminal), \"warn\" (failure/error/escalation/abort), \"success\" (successful terminal).\n- Edge `data.label` is optional; use it ONLY for conditions/triggers/branch outcomes. Unconditional sequential edges need no label.\n- Every node must belong to a group. Every group needs a distinct color.\n\n=================================================================\nHARD SIZE LIMITS (these are STRICTER than before \u2014 past graphs were still too big)\n\nL1. TOTAL NODES \u2264 13. Aim for 10\u201312. If the plan lists more, CONSOLIDATE aggressively:\n - Merge sequential automated steps into one node (e.g., \"validates txn + writes WAL + enters PREPARED\").\n - Fold a single failure detail into the decision's terminal leaf rather than a separate node.\n - Drop out-of-scope detail. Never silently drop an in-scope branch \u2014 merge instead.\n\nL2. EXACTLY 2 PANES (or 1 if the plan is small). Never 3. Each pane \u2248 5\u20137 nodes.\n Keeping each pane to 5\u20137 nodes is the single most effective way to stop off-canvas overflow.\n\nL3. PANE BALANCE & DEPTH. Keep the two panes roughly equal in node count. Keep the longest\n vertical chain in any pane \u2264 6 nodes so it does not run off the bottom. If a pane's main\n spine would exceed 6, merge steps.\n\nL4. 
LABEL LENGTH \u2264 ~80 characters, ideally on 1 line, never more than 2 lines.\n Long labels = tall narrow boxes that overflow and clip. Pack concrete facts tersely;\n drop filler, articles, and restated context. Prefer \"OCSP/CRL revoked? (hard-fail)\"\n over a full sentence. Split a single overstuffed label into two shorter ones ONLY if\n you are still within L1. Otherwise abbreviate further.\n\nL5. WIDTH CONTROL (prevents horizontal clipping):\n - Main flow = a single vertical (TB) spine.\n - Each decision branches to AT MOST one side-leaf plus the main next node.\n - Each failing decision gets its OWN ADJACENT terminal leaf. NEVER point multiple\n decisions at one distant shared \"sink\" node \u2014 those long edges cross unrelated nodes\n and the sink lands off-canvas (this caused edge-over-node errors in past runs).\n\n=================================================================\nLAYOUT / GEOMETRY RULES\n\n1. SPLIT INTO 2 PANES (groups) per the plan. One pane = one group, distinct hex color.\n\n2. NO EDGE-OVER-NODE CROSSINGS (a scored failure). An edge to a NON-ADJACENT node visually\n cuts across nodes between them.\n - Order nodes within a pane so every edge connects ADJACENT nodes along the spine.\n - Place each decision's failure leaf IMMEDIATELY beside that decision.\n - Specific past mistakes to avoid:\n * A back-edge/loop that jumps over 2+ nodes (e.g., d4\u2192m1 over m2). If a loop-back\n would skip nodes, replace it with a label-only note (\"\u2192 reopen resets to L1, Pane A\")\n on a terminal leaf instead of drawing the edge.\n * A leaf pointing to another leaf far below it (e.g., n5\u2192n14 over n13). Each leaf is\n terminal \u2014 do not chain leaves. Put the aging/escalation outcome into the leaf's own\n label rather than drawing an edge to a distant node.\n\n3. NO EDGES BETWEEN PANES. Reference the hand-off in a node LABEL only\n (e.g., \"\u2192 continues in Pane B\"). Pane B has its own entry node. 
Do NOT draw the cross-pane\n edge \u2014 past runs drew n4\u2192n8 and it both crossed nodes and confused layout.\n\n4. DIRECTION. Default \"TB\". Use \"LR\" only for a short pane with many small parallel\n side-branches. Minimize crossings and overflow.\n\n=================================================================\nCONTENT / LABEL RULES (drive comprehension) \u2014 keep TERSE per L4\n\n5. LABELS MUST BE SPECIFIC, SELF-CONTAINED, and COMPACT. Bake concrete facts from the plan\n into labels:\n - Concrete numbers: thresholds, timeouts, retries, backoffs, TTLs, %s, SLAs, quorums\n (e.g., \"timeout 10s = missing vote\u2192NO\", \"retry 3\u00d7/5s backoff\", \"quorum N/2+1\",\n \"TTL 24h auto-release\", \"skew \u00b15min\").\n - WHO performs each step (actor/component): \"Leader evaluates\", \"WMS (auto)\", \"Compliance\n Officer\", \"L1 agent\", \"Coordinator (TC)\". Readers consistently ask which actor owns a step.\n - PER-LEVEL / PER-CATEGORY criteria when the plan supplies a table or matrix. Naming the\n gate is NOT enough \u2014 readers downgraded comprehension when thresholds were only named.\n If a full matrix won't fit in a label, put the most decision-relevant numbers in the\n decision node and the rest in the entry/triage node (e.g., \"Set priority P1\u2013P4: P1=High\n impact\u00d7High urgency, resolve 4h; P4=Low, 5 biz days\"). Abbreviate hard but keep the\n actual numbers.\n - For gates/checklists, ENUMERATE the actual criteria abbreviated\n (e.g., \"trusted root + dates valid + not revoked + SAN match\", \"score\u226550, COI\u2265$1M, no sanctions\").\n - Failure/edge-case paths must appear as explicit NODES/EDGES (not implied): timeouts,\n retries-exhausted, dependency unreachable, write failure, rollback/abort, SLA-breach\n escalation, stale-leader step-down, in-doubt recovery. Keep their labels short but present.\n\n6. 
DECISION NODES: phrase as a question; label EACH outgoing edge with the branch outcome\n (e.g., \"all YES\", \"any NO/timeout\", \"FULL\", \"PARTIAL\", \"out of range\"). Edge labels were\n noted as missing/faint \u2014 always include them and keep them to a few words.\n\n7. TERMINAL STATES: explicit nodes with proper status (\"success\", \"warn\", \"neutral\").\n Prefix \"TERMINAL:\" or \"ABORT:\". Keep terminal labels short.\n\n8. COVER THE PLAN'S IN-SCOPE ELEMENTS: entry triggers, key steps, every in-scope decision/branch,\n all terminal states \u2014 but achieve this by MERGING, not by adding nodes past L1. Don't invent\n steps beyond scope (e.g., snapshot fallback, HelloRetryRequest) unless the plan lists them.\n\n=================================================================\nPROCESS (follow before emitting)\n- Map plan panes \u2192 exactly 2 groups, assign distinct hex colors.\n- List all required nodes. If > 13, CONSOLIDATE (merge sequential/automated steps; fold detail\n into terminal leaves) until \u2264 13, with each pane at 5\u20137 nodes and balanced.\n- Within each pane, order nodes so every edge connects adjacent nodes along a single vertical spine.\n- Give each failing decision its OWN adjacent terminal leaf. No distant shared sink. No leaf\u2192leaf\n edges. No cross-pane edges (use a label). No loop-back edge that skips 2+ nodes (use a label).\n- Shorten every label to \u2264 ~80 chars / 1 line where possible, preserving concrete numbers, the\n actor, and per-level criteria.\n- Final mental re-check: With generous auto-layout spacing, would 13 nodes in 2 panes of 5\u20137 each,\n longest chain \u2264 6, fit one screen? Would any edge cross an unrelated node? Would any box clip?\n If any doubt \u2192 cut nodes and shorten labels BEFORE emitting.\n\nOutput ONLY the final JSON object." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/repair_floor/report.md b/scripts/experiments/gepa-flowchart/overnight/repair_floor/report.md new file mode 100644 index 0000000..b290493 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/repair_floor/report.md @@ -0,0 +1,15 @@ +# GEPA flowchart optimization — repair_floor (RECOVERED from gepa_state) + +- iterations: 6 total evals: 138 candidates: 6 + +| idx | val_agg | comprehension | geometry | visual_quality | parent | +|---|---|---|---|---|---| +| 0 (seed) | 0.6761 | 0.795 | 0.500 | 0.832 | [None] | +| 1 | 0.6632 | 0.761 | 0.501 | 0.876 | [0] | +| 2 | 0.6618 | 0.737 | 0.517 | 0.906 | [1] | +| 3 | 0.6293 | 0.663 | 0.537 | 0.866 | [2] | +| 4 | 0.6742 | 0.690 | 0.616 | 0.919 | [2] | +| 5 **BEST** | 0.7001 | 0.844 | 0.500 | 0.878 | [0] | + +**Seed 0.6761 → Best (idx 5) 0.7001 (+0.0240)** + diff --git a/scripts/experiments/gepa-flowchart/overnight/repair_loop/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/repair_loop/best_prompts.json new file mode 100644 index 0000000..40a19a2 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/repair_loop/best_prompts.json @@ -0,0 +1,5 @@ +{ + "brainstorm": "You are planning a flowchart that will be fed to a downstream diagram generator (a React Flow-style auto-layout engine). Your plan's quality is judged on TWO things, BOTH of which must score well:\n(1) COMPREHENSION \u2014 a newcomer can answer specific factual questions from the rendered diagram, and\n(2) VISUAL RENDERING \u2014 the diagram fits on one screen with no clipped/off-canvas nodes and no edges that cross unrelated nodes.\n\nCRITICAL LESSON FROM PAST FAILURES: Plans with perfect content still scored ~0.4 overall because the rendered board pushed 18\u201326 nodes off-canvas, clipped everything after the first pane, and produced edges that ran across unrelated nodes. The content was right but UNREADABLE. 
You must treat rendering geometry as a first-class constraint, not an afterthought. A smaller plan that fully renders beats a complete plan that clips.\n\nInputs:\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nProduce a concise plain-text plan describing what the flowchart must show. Follow ALL rules below.\n\n============================================================\n=== HARD RENDERING CONSTRAINTS (these caused past failures) ===\n============================================================\n\n1. TOTAL NODE BUDGET ACROSS THE WHOLE PLAN: 12\u201318 nodes MAXIMUM, summed over ALL panes. The generator lays out every node you describe; past plans assumed each pane rendered separately and blew the budget (3 panes \u00d7 8\u20139 nodes = 24+ nodes \u2192 26 off-canvas). Count EVERY box you mention \u2014 including every decision, every terminal state, and every intermediate step \u2014 and keep the grand total at or below 18.\n\n2. PER-PANE BUDGET: each pane is 4\u20137 nodes. If you genuinely cannot fit the topic in 18 total nodes, KEEP ONLY the 2\u20133 panes that best serve the stated Purpose and explicitly drop the rest into \"Out of Scope.\" Do NOT silently include a fourth pane that will be clipped. It is better to fully cover 2 panes than to truncate 3.\n\n3. PREFER FEWER PANES. Two well-rendered panes outscore three clipped ones. Only use 3 panes if the total still fits in 18 nodes (i.e., ~6 nodes each). Never plan 4 panes.\n\n4. NO EDGE-OVER-NODE GEOMETRY. The single most common error was a decision node branching to its \"happy path\" successor while a failure/side node sat physically between them (e.g., `D1 \u2192 main_step` rendered ON TOP OF \"side_node\"). To prevent this:\n - For any decision with a \"main/continue\" branch and a \"side/failure\" branch, route the MAIN branch to the IMMEDIATELY-next node, and send the SIDE branch to a TERMINAL node placed off to the side (a leaf with no further outgoing edges). 
Failure/retry/escalation branches should END in a clearly-labeled terminal state, NOT route back into the main column past other nodes.\n - Do NOT create edges that skip over an intermediate node to reach a later one. Every edge should connect adjacent nodes in reading order or go to a leaf terminal.\n - Do NOT create edges that jump from one pane into the middle of another pane. Panes connect only via a single clean hand-off from the last node of one pane to the first node of the next (or, better, treat panes as fully independent \u2014 see #5).\n\n5. TREAT PANES AS INDEPENDENT, SELF-CONTAINED DIAGRAMS. Each pane should have its own Entry/Trigger and its own Terminal States. Inter-pane references should be a label (\"\u2192 continues to Pane B\"), NOT a drawn edge spanning panes. This keeps each pane laying out cleanly.\n\n6. NO LONG-RANGE OR BACK-JUMPING EDGES. Loop-backs/retries should connect to an adjacent node or resolve into a terminal \"retry exhausted \u2192 escalate\" leaf \u2014 never an arrow spanning the whole chart.\n\n7. KEEP TERMINAL STATES FEW (2\u20134 per pane) and place each as a leaf.\n\n8. Suggest a simple layout direction (usually top-to-bottom) OR let the engine choose. Do not over-specify layout.\n\n============================================================\n=== CONTENT: BE CONCRETE, NOT GENERIC ===\n============================================================\nWithin the tight node budget, every node you DO include must carry the specific detail a reader will ask about. When choosing what to keep, prioritize the nodes/details that answer the Purpose's implied factual questions (the reader will be quizzed on triggers, ordering, actors, mappings, and failure paths). For each step/decision/trigger, NAME the conventional value rather than describing it abstractly. 
Always specify:\n\n- TRIGGERS WITH VALUES: concrete thresholds, durations, counts, conditions (e.g., \"CPU > 80% sustained 5 min\", \"3 missed heartbeats / 15s timeout\", \"majority = N/2+1 nodes\"). State a representative real-world default; never say \"a threshold\" or \"a grace window.\"\n- EVALUATING ACTOR: for each decision, name WHO/WHAT evaluates it (the node, a leader/coordinator, a quorum service, CI, on-call engineer, incident commander). State explicitly automated vs. human-in-the-loop approval.\n- STATE/DATA DETAILS: who creates/increments/persists/validates it, where it's stored, at which step it's checked or causes rejection.\n- FAILURE PATHS: for every fallible action or external dependency, specify behavior \u2014 block, retry (how many / what backoff), escalate (to whom / after what timeout), or fail-safe/rollback. Each failure branch must terminate in an explicit leaf node; never leave it implied. (But remember: each such terminal counts against the node budget \u2014 keep them tight.)\n- ORDERING: state the explicit sequence of major operations, AND mark which steps are automated vs. manual where the audience would ask.\n- MAPPING TABLES / LEGENDS: when the topic has a fixed mapping (commit prefix \u2192 version bump, severity tiers, category \u2192 queue, impact\u00d7urgency \u2192 priority), list the FULL mapping inline IN THE LEGEND. Mapping tables live in the text legend, not as separate nodes, so they don't consume the node budget but are still answerable.\n- NEWCOMER CONTEXT: a short legend defining domain terms the audience-newcomer needs, with concrete values.\n\nPush exhaustive mappings, rate tables, SLA tables, and definitions into the LEGEND text (section 2). Reserve actual flowchart NODES for the flow logic only. 
This is how you stay within the node budget while keeping comprehension high.\n\n============================================================\n=== OUTPUT FORMAT (plain text, labeled sections) ===\n============================================================\n1. Purpose & Scope (1\u20132 sentences).\n2. Context for Newcomers \u2014 legend: terms + ALL fixed mappings/tables with concrete values (this carries comprehension detail without using nodes).\n3. Panes \u2014 name each pane, what it covers, and its node count. Then state the GRAND TOTAL node count and confirm it is \u2264 18.\n4. For each pane: \n - Entry/Trigger (concrete condition),\n - Key Steps/States (ordered, concrete details + responsible actor, automated vs manual),\n - Decisions & Branch Triggers (each: specific threshold/condition; evaluating actor; every branch's destination; main branch \u2192 next node, failure/timeout/retry/escalation branch \u2192 a leaf terminal),\n - Terminal States (leaves, 2\u20134).\n - Explicitly confirm no edge skips over an intermediate node and no edge crosses into another pane.\n5. Out of Scope \u2014 state explicitly what is excluded, INCLUDING any panes/details you dropped to stay within the node budget.\n\nBefore finalizing: re-count every node across all panes. If the total exceeds 18, cut content (drop a pane to Out of Scope or merge steps) until it fits. Verify every failure branch ends in a leaf and no edge runs past an intermediate node. Prioritize concrete values, named actors, and explicit failure handling over volume \u2014 but never at the cost of fitting on one screen.", + "generate": "You generate a flowchart `flow` diagram as JSON that renders cleanly in a React Flow canvas and is readable as a SINGLE screenshot without panning or zooming.\n\n{skill_context}\n\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single flow JSON object. 
Output ONLY the JSON (no prose, no code fences).\n\n=================================================================\nTOP PRIORITY: THE IMAGE MUST FIT ON SCREEN\nThis is the single most important and most-failed requirement. In past renders, logic was perfect but the IMAGE FAILED: nodes rendered OFF the visible canvas (7\u20139 of 13 nodes off-screen in multiple cases), node-pair OVERLAPS, boxes CLIPPED at right/bottom edges, and edge labels TOO FAINT/SMALL to read. A \"clean\" abstract geometry check does NOT mean it rendered on-screen \u2014 the auto-layout pushes content off-canvas when there are too many nodes, the graph is too wide, or labels are too long.\n\nThe fixes that matter most, in priority order:\n 1. FEWER NODES (the biggest lever \u2014 see L1).\n 2. SHORTER LABELS (long labels = tall/wide boxes that overflow \u2014 see L3).\n 3. A NARROW, SHORT layout (see DIRECTION and WIDTH rules).\nTreat compactness as MORE important than completeness of any single label.\n\n=================================================================\nOUTPUT SCHEMA\n{\n \"direction\": \"TB\" | \"LR\",\n \"nodes\": [\n { \"id\": \"<unique-id>\", \"data\": { \"label\": \"<string>\", \"status\": \"<status>\" }, \"group\": \"<groupId>\" }\n ],\n \"edges\": [\n { \"source\": \"<id>\", \"target\": \"<id>\", \"data\": { \"label\": \"<condition/trigger>\" } }\n ],\n \"groups\": [\n { \"id\": \"<groupId>\", \"label\": \"<pane title>\", \"color\": \"<hex>\" }\n ]\n}\n- Use EXACTLY these keys. Do NOT add top-level keys like \"panes\" \u2014 group membership is expressed only via each node's \"group\" field.\n- `status` values: \"info\" (entry/trigger points), \"active\" (process/action steps), \"neutral\" (decision/evaluation nodes or neutral terminals), \"warn\" (failure/error/escalation/abort), \"success\" (successful terminals).\n- Edge `data.label` is optional; use it ONLY for conditions/triggers/branch outcomes. 
Unconditional sequential edges need no label.\n- Every node must belong to a group. Every group needs a distinct color.\n\n=================================================================\nHARD SIZE LIMITS (do not exceed \u2014 these prevent off-canvas/overlap failures)\n\nL1. TOTAL NODES \u2264 13. Aim for 10\u201312. Fewer nodes is the most reliable way to keep everything on canvas. If the plan lists more, CONSOLIDATE steps (merge sequential automated steps; fold a short remediation into its terminal label) and drop only out-of-scope detail \u2014 never silently drop in-scope branches. Do NOT pad to a target count: if the plan needs 10 nodes, emit 10.\n\nL2. EXACTLY 2 PANES MAXIMUM (1 is fine for small flows). Never 3 panes. Keep ~5\u20137 nodes per pane. If the plan defines a 3rd pane mostly for shared failure/abort terminals, do NOT add it \u2014 keep each abort terminal as a leaf inside the pane that triggers it (see L4).\n\nL3. LABEL LENGTH: keep each node label \u2264 ~70 characters, on 1 line where possible, never more than 2 short lines. Long labels become tall/wide boxes that overflow and get clipped. Pack concrete facts tersely; drop filler words, articles, restated context. Prefer \"OCSP/CRL revoked? (hard-fail)\" over a full sentence. Use abbreviations the audience knows. Fold multi-step remediation into a compact terminal (e.g., \"DENY: log reason + send template\" not a paragraph).\n\nL4. WIDTH CONTROL. A pane that fans out into side-leaves or grows too wide gets clipped on the right.\n - Keep the main flow a single vertical (TB) spine.\n - Each decision branches to AT MOST one side-leaf (a failure/terminal) PLUS the main next node. Place the side-leaf immediately beside its decision.\n - Do NOT create one shared distant \"sink\" node that many decisions point to \u2014 those long edges cross unrelated nodes. 
Give each failing decision its OWN adjacent terminal leaf.\n\n=================================================================\nLAYOUT RULES (drive the geometry/visual score)\n\nR1. DIRECTION \u2014 DEFAULT TO \"TB\". Use \"TB\" (top-to-bottom) for essentially all flows. TB stacks the two panes/columns and keeps the rendered width small. \"LR\" repeatedly caused nodes to run off the RIGHT edge in past renders \u2014 do NOT use \"LR\" unless a pane is very short (\u22644 nodes) with no second pane. When in doubt, use \"TB\".\n\nR2. KEEP IT SHORT AS WELL AS NARROW. With 2 panes in TB, the combined height can still run off the BOTTOM. This is why L1 (\u226413 nodes) and L3 (short labels) matter \u2014 tall boxes from long labels compound the overflow. If you sense the flow is long, cut nodes before emitting.\n\nR3. NO EDGE-OVER-NODE CROSSINGS. An edge to a NON-ADJACENT node visually cuts across nodes between them.\n - Order nodes within a pane so edges connect ADJACENT nodes; lay the main path out linearly.\n - Branch edges (decision \u2192 terminal) must go to an ADJACENT leaf. Never route over 2+ intermediate nodes.\n - Loop-back edges are allowed ONLY when source and target are adjacent/near-adjacent. If a back-edge jumps over 2+ nodes, restructure.\n\nR4. CROSS-PANE HANDOFF: do NOT draw edges between panes. Reference the handoff in a node label (e.g., \"\u2192 continues in Pane B\"). Pane B has its own entry node.\n\n=================================================================\nEDGE LABELS \u2014 MUST BE PRESENT AND LEGIBLE\nEdge labels were repeatedly flagged as missing or too faint to read in the rendered image (e.g., decision branch outcomes invisible). 
To maximize legibility:\n - Label EVERY outgoing edge of a decision node with the branch outcome, kept VERY SHORT (1\u20133 words): \"alive\", \"timeout\", \"valid IP\", \"NO $201+\", \"top 25%\", \"SLA expired\".\n - Do NOT put long phrases in edge labels \u2014 short labels render larger and clearer.\n - Unconditional sequential edges get NO label (omit it) so the decision-branch labels stand out.\n\n=================================================================\nCONTENT / LABEL RULES (drive comprehension) \u2014 keep them TERSE per L3\n\nC1. LABELS MUST BE SPECIFIC AND SELF-CONTAINED, but compact. Bake concrete facts from the plan directly into labels:\n - Concrete numbers: thresholds, timeouts, retries, backoffs, TTLs, %s, SLAs, quorums, caps (e.g., \"timeout 150ms = 3 missed beats\", \"quorum N/2+1\", \"SLA 10 biz days\", \"cap $200\", \"30-day window\", \"rungs 1/4/16ep, \u03b7=4\").\n - WHO performs each step (actor/component): e.g., \"Leader\", \"Client TLS lib\", \"Compliance Officer\", \"Rep\", \"Orchestrator\". Readers consistently ask which actor owns a step \u2014 include it, abbreviated.\n - For gates/checklists, ENUMERATE the actual criteria, abbreviated (e.g., \"evidence valid + in 30d window\", \"score\u226550, COI\u2265$1M, no sanctions\"). Naming the gate alone is insufficient.\n - When the plan specifies a metric/threshold for a decision (e.g., the val-metric used for early-stop), name it in the decision label, not just the policy name.\n\nC2. DECISION NODES: phrase as a question; label EACH outgoing edge with the short branch outcome (see EDGE LABELS).\n\nC3. TERMINAL STATES: explicit nodes with proper status (\"success\" good ends, \"warn\" error/abort ends, \"neutral\" held/archived). Prefix clearly: \"TERMINAL: ...\" or \"ABORT: ...\". Keep terminal labels short; fold remediation into a few words.\n\nC4. COVER THE PLAN'S SCOPE within the node budget: entry triggers, key steps, every in-scope decision/branch, all terminal states. 
Failure/edge-case paths must appear as explicit NODES/EDGES (timeouts, retries-exhausted, dependency unreachable, write failure, rollback/abort, escalation when SLA exceeded, stale-leader step-down). Don't invent steps beyond scope; don't drop in-scope branches.\n\n=================================================================\nPROCESS (follow before emitting)\n- Set direction = \"TB\".\n- Map plan panes \u2192 groups (\u2264 2), assign distinct hex colors (e.g., #2563eb and #16a34a or #9333ea).\n- List all required nodes; if > 13, consolidate until \u2264 13 (target 10\u201312).\n- Within each pane, order nodes along a single vertical spine so edges stay short and adjacent.\n- Give each failing decision its OWN adjacent terminal leaf (no distant shared sink, no abort pane).\n- Shorten every label to \u2264 ~70 chars / 1 line while preserving the concrete numbers and actor.\n- Make every decision-branch edge label 1\u20133 words; remove labels on plain sequential edges.\n- Mentally re-check rendering: With this many nodes and these label lengths in TB, will the LAST nodes of Pane B fit on one screen, or will they fall off the bottom/right? If at risk, CUT nodes and SHORTEN labels before emitting. Would any edge cross an unrelated node? If yes, restructure.\n\nOutput ONLY the final JSON object.", + "fix": "You are repairing a flowchart `flow` JSON so it RENDERS cleanly on one screen. A renderer drew the current diagram and reported concrete problems. Fix ONLY layout/legibility \u2014 do not drop or weaken the content (keep every node, edge, label, and its specific detail).\n\nCurrent flow JSON:\n{flow_json}\n\nMeasured problems from the actual render:\n{findings}\n\nRevise the JSON to eliminate those specific problems. 
Levers you may use: shorten overly long labels (keep the concrete facts), choose a better `direction`, split into groups/panes so dagre packs them, give each failing decision its own ADJACENT terminal leaf (no distant shared sink), reduce node count only by merging trivially-sequential steps (never by dropping a branch). Output ONLY the revised flow JSON object." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/repair_loop/report.md b/scripts/experiments/gepa-flowchart/overnight/repair_loop/report.md new file mode 100644 index 0000000..240f911 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/repair_loop/report.md @@ -0,0 +1,13 @@ +# GEPA flowchart optimization — repair_loop (RECOVERED from gepa_state) + +- iterations: 13 total evals: 148 candidates: 4 + +| idx | val_agg | comprehension | geometry | visual_quality | parent | +|---|---|---|---|---|---| +| 0 (seed) | 0.5809 | 0.636 | 0.479 | 0.691 | [None] | +| 1 | 0.5904 | 0.658 | 0.468 | 0.744 | [0] | +| 2 | 0.5693 | 0.636 | 0.458 | 0.706 | [0] | +| 3 **BEST** | 0.5924 | 0.647 | 0.468 | 0.754 | [0] | + +**Seed 0.5809 → Best (idx 3) 0.5924 (+0.0115)** + diff --git a/scripts/experiments/gepa-flowchart/overnight/reweighted_viz/report.md b/scripts/experiments/gepa-flowchart/overnight/reweighted_viz/report.md new file mode 100644 index 0000000..571057e --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/reweighted_viz/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: er-diagram, class-diagram, state-machine, build-pipeline, shopping-comparison, product-comparison, metrics-dashboard, observability-dashboard, recipe, explainer, app-screen-mockup, training-curves +shared skills: artifact_note, board_layout, chart_internal, comparison_grid, dashboard_grid, graph_entity_lanes, graph_process_spine, report_rows, screen_frame +shared skills CHANGED by GEPA: (none) + +**Seed 0.4998 -> Best 0.5265 (+0.0267)** diff --git 
a/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_build-pipeline/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_build-pipeline/best_prompts.json new file mode 100644 index 0000000..e318f39 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_build-pipeline/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'build-pipeline' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Each pipeline stage (build, test, scan, deploy) is a node\n- Stages are colored by status: passed=green, running=amber, failed=red\n- Stages connect in execution order with arrowheads\n- Stages show a duration or step detail in a meta/sub line\n- If a stage fails, it is clearly red and the downstream stages reflect being blocked/skipped\n- A legend maps status colors to passed/running/failed\n- No stage nodes overlap and edges do not pass through unrelated nodes\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_build-pipeline/report.md b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_build-pipeline/report.md new file mode 100644 index 0000000..0d00176 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_build-pipeline/report.md @@ -0,0 +1,5 @@ +# GEPA journey: build-pipeline + +rubric criteria: 7; val: 2 + +**Seed 0.7078 → Best 0.7078 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_class-diagram/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_class-diagram/best_prompts.json new file mode 100644 index 0000000..b3080a2 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_class-diagram/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'class-diagram' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are producing a concrete, render-ready plan for a UML class diagram. It is judged\nagainst a strict rubric. A great board MUST satisfy EVERY criterion below. The judge\nchecks the ACTUAL specified structure, not your claims \u2014 so make every criterion\nprovable by the explicit content you list. Do not write \"compliance check\" sections that\nmerely assert success; instead bake the proof into concrete values.\n\nRUBRIC CRITERIA AND HOW TO GUARANTEE EACH:\n\n1. CLASS BOXES (titled box, attribute rows, method() rows)\n - Give every class a title row, then a clearly separated attribute section, then a\n clearly separated method section. 
Use the standard 3-compartment UML layout:\n +------------------+\n | ClassName | <- title compartment\n +------------------+\n | - attr: Type | <- attribute compartment\n +------------------+\n | + method(): Type | <- method compartment\n +------------------+\n - EVERY class must have at least one real attribute AND at least one real method.\n A box with no attributes OR no methods loses points. Even leaf subclasses must\n list their own concrete attributes (do not leave a child with only inherited\n methods and no attributes).\n\n2. METHODS DISTINCT FROM ATTRIBUTES\n - Every method name MUST end in parentheses with its parameter list, e.g.\n `+ authorize(amount: Decimal): bool`. Always include `()` even when empty.\n - Attributes MUST NOT have parentheses: `- balance: Decimal`.\n - Keep methods and attributes in physically separate compartments so the\n distinction is structural, not just textual.\n\n3. INHERITANCE DIRECTION \u2014 PARENTS ABOVE CHILDREN (BT orientation)\n - This is the most commonly failed criterion. \"BT (bottom-to-top)\" means the\n generalization EDGES POINT UPWARD from child to parent, which renders the PARENT\n ABOVE its children. State this explicitly AND back it with coordinates.\n - Assign every class an explicit (x, y) where y is the vertical position. Use a\n coordinate system where the PARENT has the SMALLEST y if y grows downward\n (top of canvas), and children have LARGER y (lower on canvas). State your axis\n convention in one sentence: e.g. \"y increases downward; parent at smallest y\".\n - The arrowhead (hollow triangle) must sit at the PARENT end, and the parent must\n be vertically above all its children. Never place a parent at the same row as or\n below its children.\n\n4. RELATIONSHIP EDGES WITH ARROWHEADS\n - List every edge explicitly as: Source --> Target, edge type, arrowhead type and\n location. 
Use correct UML notation:\n * Generalization/inheritance: solid line, HOLLOW TRIANGLE at parent end.\n Notation: Child \u2500\u2500\u25b7 Parent\n * Association: solid line, open arrow (or none) \u2014 label the role/multiplicity.\n * Composition: filled diamond \u25c6 at the WHOLE end. Notation: Whole \u25c6\u2500\u2500 Part.\n * Aggregation: hollow diamond \u25c7.\n - Every edge needs a clearly named arrowhead and a clearly named source and target.\n\n5. NO OVERLAP / NO EDGES THROUGH UNRELATED BOXES\n - Give explicit coordinates and box sizes so spacing is unambiguous.\n - Space sibling children far enough apart that vertical edges run in the gaps\n between boxes. Set the parent box width >= the longest text string it contains\n so text never clips.\n - Route inheritance edges to fan from one shared point under the parent so they do\n not pass through sibling boxes. Place any extra (association/composition)\n classes off to the side with edges that curve around, not through, the hierarchy.\n\n6. HIERARCHY READABLE AT A GLANCE (clear tree, not a tangle)\n - Use a single parent at top, children in one evenly-spaced row directly below.\n Include a small ASCII sketch of the tree to make the structure unmistakable.\n\n7. EVERY CLASS CONNECTED (no orphan)\n - Confirm each class is an endpoint of at least one edge. If you introduce a helper\n class (e.g. a Container, a Receipt, an enum), it MUST have an edge to the\n hierarchy \u2014 otherwise omit it.\n\nCONTENT GUIDANCE:\n- Be specific and real: concrete attribute types, concrete parameter lists, concrete\n return types, real domain values. Generic placeholder plans fail.\n- Prefer richer, fully-populated boxes: aim for 3-5 attributes and 3-5 methods on the\n base class, and at least 1-2 OWN attributes plus 2+ methods on each child. 
Thin\n boxes (especially attribute-less children) are penalized.\n- For \"what to override\" goals, mark abstract methods on the parent and show each child\n redeclaring them, but ALSO give each child its own distinct attributes/methods so the\n boxes are not near-duplicates.\n- Add a short reader-facing takeaway tied to the stated Goal.\n\nOUTPUT FORMAT (concise plain text):\n- Axis convention + orientation statement (one line).\n- ASCII tree sketch of the hierarchy.\n- For each class: title, explicit (x, y) position and approximate box width, attribute\n rows, method() rows.\n- Explicit edge list: Source, Target, type, arrowhead symbol + end location.\n- A brief routing/spacing note proving no overlap and no edge crossing unrelated boxes.\n- A one-line reader takeaway.\n\nDo not pad with self-congratulatory checklists. Make the concrete content itself the\nproof that every rubric item holds.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_class-diagram/report.md b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_class-diagram/report.md new file mode 100644 index 0000000..acc4568 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_class-diagram/report.md @@ -0,0 +1,5 @@ +# GEPA journey: class-diagram + +rubric criteria: 7; val: 2 + +**Seed 0.7847 → Best 0.7984 (+0.0138)** diff --git a/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_critical-path/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_critical-path/best_prompts.json new file mode 100644 index 0000000..113f045 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_critical-path/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'critical-path' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Tasks are laid out left-to-right reflecting the time/schedule axis\n- The critical path tasks are highlighted red with thick edges and a 'bolt'-style icon (not read as 'failed')\n- Tasks with slack are shown neutral/grey with dashed edges, distinct from the critical chain\n- Each task shows its duration (e.g. '6d') in a meta line\n- Each task shows its owner (and optionally slack) in a sub line\n- A two-row legend distinguishes critical path (red) from has-slack (grey-dash)\n- No task nodes overlap and edges do not pass through unrelated nodes\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. 
Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_critical-path/report.md b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_critical-path/report.md new file mode 100644 index 0000000..da90d54 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_critical-path/report.md @@ -0,0 +1,5 @@ +# GEPA journey: critical-path + +rubric criteria: 7; val: 2 + +**Seed 0.6008 → Best 0.6008 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_er-diagram/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_er-diagram/best_prompts.json new file mode 100644 index 0000000..7a5389a --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_er-diagram/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'er-diagram' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Each table/entity is a titled box listing its fields as rows\n- Fields show their data types and key fields (PK/FK) are marked\n- Edges between entities are labeled with cardinality (e.g. 1..*, places, has)\n- Relationship edges have visible arrowheads/markers\n- The schema is laid out top-to-bottom\n- Entity boxes do not overlap and edges do not cross through unrelated entity boxes\n- Every entity participates in at least one labeled relationship (no orphan tables)\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. 
Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_er-diagram/report.md b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_er-diagram/report.md new file mode 100644 index 0000000..9b54245 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_er-diagram/report.md @@ -0,0 +1,5 @@ +# GEPA journey: er-diagram + +rubric criteria: 7; val: 2 + +**Seed 0.5602 → Best 0.5602 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_okr-tree/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_okr-tree/best_prompts.json new file mode 100644 index 0000000..62aaee5 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_okr-tree/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan an 'okr-tree' board that will be rendered by an automated system into an actual diagram.\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nCRITICAL: Your output is PARSED BY A RENDERER, not read as prose. The parser detects\nrubric facts by matching exact literal field keys and values. Any deviation in field\nnaming, spelling, or value vocabulary causes that rubric criterion to score 0. Do NOT\nuse markdown headers, bold, emoji, or decorative formatting. Emit ONLY the plain-text\nstructured spec described below.\n\n=================================================================\nHARD RUBRIC REQUIREMENTS \u2014 satisfy EVERY one (each is machine-checked)\n=================================================================\n\n(R1) STRUCTURE: Exactly 1 objective, exactly 3 key results, exactly 2 initiatives per\n key result (6 initiatives). Total = 10 nodes and 9 edges. 
Three levels flow strictly\n downward: Objective (top) -> Key Results (middle) -> Initiatives (bottom).\n\n(R2) KIND TAGS: every node has a `kind:` field whose value is EXACTLY one of these three\n literal strings (lowercase, with the space):\n kind: objective\n kind: key result\n kind: initiative\n Use no other spelling (not \"Objective\", not \"KR\", not \"key-result\").\n\n(R3) PROGRESS META: every objective node AND every key-result node MUST have a `meta:` line\n containing a literal percent token like `62%` (digits immediately followed by `%`).\n Initiative meta lines must NOT contain a percent token.\n\n(R4) STATUS + COLOR: every node has BOTH a `status:` field and a `color:` field. The values\n must come from this exact vocabulary and be mapped exactly:\n status: on-track -> color: green\n status: at-risk -> color: amber\n status: behind -> color: red\n Spell `on-track`, `at-risk`, `behind` exactly (lowercase, hyphenated). Spell colors\n `green`, `amber`, `red` exactly (lowercase). On every node the color MUST be the one\n mapped to that node's status.\n\n(R5) EDGE COLORS: every edge has a `color:` value equal to the color of its TARGET (child)\n node's status. Objective->KR edge uses the KR's color. KR->Initiative edge uses the\n initiative's color. Double-check each edge color literally equals the child node's\n color string.\n\n(R6) LEGEND: the output MUST begin with a legend block containing these three literal\n lines, exactly:\n green = on-track\n amber = at-risk\n red = behind\n\n(R7) GEOMETRY: place every node on a 1200 x 800 canvas (origin top-left). Keep ALL nodes\n well inside bounds \u2014 because nodes have width/height, use a SAFE inner margin:\n x in [120, 1080] y in [60, 740]\n Use these row heights: objective y=80, key results y=350, initiatives y=650.\n Spread KRs across x at 250 / 600 / 950. Place each KR's two initiatives near its x\n but keep them inside the safe margin. 
Use these initiative x-values (already safe):\n KR1 (x=250): I1 x=160, I2 x=340\n KR2 (x=600): I3 x=510, I4 x=690\n KR3 (x=950): I5 x=860, I6 x=1040\n Never use x>1080 or x<120 (values like 1060/1070 are OFF-CANVAS and fail).\n\n=================================================================\nOUTPUT FORMAT \u2014 reproduce this structure EXACTLY (same keys, same order)\n=================================================================\n\nLEGEND:\n- green = on-track\n- amber = at-risk\n- red = behind\n\nNODES:\n[O1]\n kind: objective\n label: <concrete objective title with real numbers>\n status: <on-track|at-risk|behind>\n color: <green|amber|red>\n meta: <progress with a percent token, e.g. \"58% complete\">\n pos: x=600, y=80\n\n[KR1]\n kind: key result\n label: <concrete KR with current value, target value>\n status: <on-track|at-risk|behind>\n color: <green|amber|red>\n meta: <current / target \u2014 NN%>\n pos: x=250, y=350\n\n[KR2]\n kind: key result\n ...\n pos: x=600, y=350\n\n[KR3]\n kind: key result\n ...\n pos: x=950, y=350\n\n[I1]\n kind: initiative\n label: <concrete initiative name>\n status: <on-track|at-risk|behind>\n color: <green|amber|red>\n meta: <short status note, NO percent token>\n pos: x=160, y=650\n\n[I2] ... pos: x=340, y=650 (child of KR1)\n[I3] ... pos: x=510, y=650 (child of KR2)\n[I4] ... pos: x=690, y=650 (child of KR2)\n[I5] ... pos: x=860, y=650 (child of KR3)\n[I6] ... 
pos: x=1040, y=650 (child of KR3)\n\nEDGES:\n- O1 -> KR1 color: <KR1 color>\n- O1 -> KR2 color: <KR2 color>\n- O1 -> KR3 color: <KR3 color>\n- KR1 -> I1 color: <I1 color>\n- KR1 -> I2 color: <I2 color>\n- KR2 -> I3 color: <I3 color>\n- KR2 -> I4 color: <I4 color>\n- KR3 -> I5 color: <I5 color>\n- KR3 -> I6 color: <I6 color>\n\nTAKEAWAY:\n<1-2 sentences tailored to the Reader and Goal, naming the specific behind/red (or amber)\nbranch driving the objective down and the concrete initiative(s) to act on.>\n\n=================================================================\nCONTENT GUIDANCE\n=================================================================\n- Make every label specific to the use case with REAL values (numbers, %, dollar figures,\n feature/service names). Generic labels score low on comprehension.\n- Give KR/initiative meta lines concrete current/target numbers or status notes.\n- Use a realistic MIX of statuses so colors are meaningful. The objective's color should\n reflect the rollup of its KRs (if any KR is red the objective is typically amber or red;\n if all green, objective is green).\n- The TAKEAWAY must reference the actual node labels/branches you created.\n\n=================================================================\nFINAL SELF-CHECK before emitting (verify silently, then output):\n=================================================================\n1. Legend block present with the 3 exact lines.\n2. 10 nodes total: 1 objective + 3 KRs + 6 initiatives. 9 edges total.\n3. Every node has kind / label / status / color / pos lines, in that order.\n4. Every kind value is exactly: objective | key result | initiative.\n5. Every status is on-track|at-risk|behind and its color matches the R4 mapping.\n6. Objective and all 3 KRs have a percent token in meta; initiatives have NO percent.\n7. Every edge color equals its target node's color string.\n8. 
All x in [120,1080] and y in {80,350,650}; no off-canvas node.\nOutput the spec only \u2014 no extra commentary, no markdown styling.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_okr-tree/report.md b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_okr-tree/report.md new file mode 100644 index 0000000..bc7b226 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_okr-tree/report.md @@ -0,0 +1,5 @@ +# GEPA journey: okr-tree + +rubric criteria: 7; val: 2 + +**Seed 0.3209 → Best 0.4592 (+0.1383)** diff --git a/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_query-plan/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_query-plan/best_prompts.json new file mode 100644 index 0000000..ded09c6 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_query-plan/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'query-plan' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou produce a concise plain-text spec for a visual board that renders a SQL\nEXPLAIN query plan as a top-down tree of operator nodes. The board is judged by a\nSTRICT automated rubric. Every criterion must be satisfied with CONCRETE,\nSPECIFIC content (real table names, predicates, join keys, row counts, cost\nnumbers). Generic plans fail.\n\n================================================================\nRUBRIC CRITERIA (all mandatory)\n================================================================\n1. Top-down tree of SQL operator nodes (scans, joins, aggregates) \u2014 root at top,\n leaves at bottom.\n2. 
Operators colored by cost: cheap=green, moderate=amber, hot=red.\n3. The single most expensive operator (hot scan/join) flagged with \ud83d\udd25 AND red.\n4. Each operator shows rows and/or cost figures in a meta/sub line.\n5. Each operator shows its relevant predicate / join key / grouping detail.\n6. The tree reads top-to-bottom.\n7. A legend mapping each cost color to its meaning.\n\n================================================================\nTWO CRITERIA FAIL REPEATEDLY \u2014 FOLLOW THESE EXACTLY\n================================================================\n\nA) COST_COLORING \u2014 must be machine-parseable and PERFECTLY UNIFORM.\n - Every single node MUST contain one color line, formatted EXACTLY:\n color: red\n color: amber\n color: green\n - Lowercase color word. No parentheses, no extra words, no trailing text on\n that line. One space after the colon. Nothing indented before \"color:\".\n - Use the SAME exact format on every node \u2014 do not vary spacing/indentation\n between nodes.\n - All three colors MUST appear across the tree: at least one green, at least\n one amber, at least one red. Verify before finishing.\n - Color must match the legend's numeric band for that node's cost.\n\nB) LEGEND_PRESENT \u2014 output a clearly delimited block. Use EXACTLY this, with the\n word LEGEND alone on its own line as the heading, and exactly three bullet\n lines using \"color = meaning (cost band)\":\n LEGEND\n - green = cheap (cost < X)\n - amber = moderate (cost X\u2013Y)\n - red = hot (cost > Y) \ud83d\udd25\n - Replace X and Y with real numbers consistent with your node costs.\n - Do not merge the legend into prose or the callout. 
Keep it as its own block.\n - The three color words here must be lowercase green/amber/red, matching A).\n\n================================================================\nGEOMETRY / LAYOUT RULES (off-canvas = automatic point loss)\n================================================================\n- MAXIMUM 6 nodes total; prefer 4\u20135.\n- EVERY physical line must stay under ~70 characters. THIS IS WHY BOARDS GO\n OFF-CANVAS: never let a meta line wrap onto a second line. If a meta line would\n exceed ~70 chars, SHORTEN it (abbreviate, use k/M for thousands/millions, drop\n optional fields) \u2014 do NOT continue it on a wrapped/indented second line.\n- Never split a single field (meta:, detail:) across two lines.\n- The SQL in the SUBTITLE is the ONLY place wrapping is acceptable; keep each\n wrapped SQL line short too.\n- Output nodes in strict top-to-bottom reading order (root first).\n- Include an explicit edge/parent-child list so the tree is unambiguous.\n\n================================================================\nDOMAIN FACTS (Postgres EXPLAIN conventions) \u2014 apply as relevant\n================================================================\n- Operators: Seq Scan (full table scan, usually the hot node), Index Scan /\n Index Only Scan / Bitmap Index Scan (cheap), Hash Join / Nested Loop /\n Merge Join, HashAggregate / GroupAggregate, Sort, Limit, Result.\n- A leading-wildcard LIKE/ILIKE '%term%' cannot use a B-tree index \u2192 forces\n Seq Scan; fix with a GIN trigram index:\n CREATE EXTENSION IF NOT EXISTS pg_trgm;\n CREATE INDEX ... 
ON tbl USING gin (col gin_trgm_ops);\n (For ranked, stemmed multi-word search use a tsvector + GIN instead.)\n- A range filter on an unindexed date column (order_date >= '2024-01-01') forces\n a full scan; fix with CREATE INDEX ON table(col).\n- A HashAggregate that spills to disk on a large GROUP BY is a candidate for a\n materialized view (refresh on schedule).\n- Seq Scan meta should show \"rows returned of rows scanned\" plus \"Rows Removed\n by Filter\" to expose waste (use k/M abbreviations to stay short).\n- The hot node typically dominates a large % of total runtime \u2014 state that % in\n its meta/flag line.\n- Costs flow upward: a parent's cost includes its children's, so the root has the\n largest total cost while the hot node is the largest *self* contributor.\n\n================================================================\nREQUIRED OUTPUT STRUCTURE (plain text, in this order)\n================================================================\n1. TITLE \u2014 the query intent, one line.\n2. SUBTITLE \u2014 the actual SQL (may wrap, keep lines short), then a line with\n total cost \u00b7 total time \u00b7 rows returned.\n3. OPERATOR TREE \u2014 nodes top-to-bottom. For EACH node, on its own lines:\n <n>. <Operator type> \u2014 <table/target>\n color: <red|amber|green>\n detail: <predicate / join key / grouping>\n meta: rows=... \u00b7 cost=... \u00b7 time=...\n For the hot node ONLY, add one extra line directly under its meta:\n \ud83d\udd25 HOT \u2014 most expensive (NN% of runtime)\n Keep every one of these lines under ~70 chars.\n4. EDGES \u2014 explicit parent \u2192 child list, e.g. \"1 Sort \u2192 2 HashAggregate\".\n5. LEGEND \u2014 the three-line block from section B above.\n6. CALLOUT \u2014 one short ASCII box tying the hot node to the reader's goal: the\n specific index or materialized view to add, with projected before\u2192after cost\n and time. Keep each boxed line under ~70 chars. 
Tailor it directly to\n {purpose} for {audience}.\n\n================================================================\nFINAL SELF-CHECK before output\n================================================================\n- [ ] Exactly one \"color: <word>\" line per node, identical formatting throughout.\n- [ ] green, amber, AND red all appear at least once.\n- [ ] Colors match their legend cost bands.\n- [ ] LEGEND block present, on its own, with exact 3-line format.\n- [ ] Exactly one node flagged with \ud83d\udd25 + red, with NN% of runtime.\n- [ ] \u22646 nodes; no physical line wraps except SQL in SUBTITLE.\n- [ ] Real table names, predicates, join keys, and numeric rows/cost/time used.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_query-plan/report.md b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_query-plan/report.md new file mode 100644 index 0000000..ec71876 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_query-plan/report.md @@ -0,0 +1,5 @@ +# GEPA journey: query-plan + +rubric criteria: 7; val: 2 + +**Seed 0.8263 → Best 0.8577 (+0.0314)** diff --git a/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_recursion-tree/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_recursion-tree/best_prompts.json new file mode 100644 index 0000000..4c73920 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_recursion-tree/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'recursion-tree' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great 
board MUST satisfy EVERY criterion:\n- Recursive calls form a top-down tree, each call a node\n- Base-case leaf nodes are colored green/done\n- Each call node shows its return value in a sub line\n- Memoized/cache-hit calls are visually distinct (e.g. blue/info) with a 'cached' edge label, if applicable\n- The tree reads top-to-bottom from the root call\n- No call nodes overlap and edges do not pass through unrelated nodes\n- A legend maps node colors to base case / internal call / memoized\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_recursion-tree/report.md b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_recursion-tree/report.md new file mode 100644 index 0000000..e9aeb28 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_recursion-tree/report.md @@ -0,0 +1,5 @@ +# GEPA journey: recursion-tree + +rubric criteria: 7; val: 2 + +**Seed 0.4593 → Best 0.4593 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_state-machine/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_state-machine/best_prompts.json new file mode 100644 index 0000000..c472c22 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_state-machine/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'state-machine' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY 
criterion:\n- Each distinct state is a clearly labeled node\n- Every transition edge carries a label naming the triggering event/action\n- Every transition edge has a visible arrowhead showing direction\n- Terminal/final state(s) are visually distinguished (e.g. green/done tone)\n- An initial/entry state is identifiable as the starting point\n- The diagram is laid out top-to-bottom\n- No transition edge passes through an unrelated state node\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_state-machine/report.md b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_state-machine/report.md new file mode 100644 index 0000000..5f0b89b --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/sgcr_journey_state-machine/report.md @@ -0,0 +1,5 @@ +# GEPA journey: state-machine + +rubric criteria: 7; val: 2 + +**Seed 0.3765 → Best 0.3765 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo2_chart/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo2_chart/best_topology_skills.json new file mode 100644 index 0000000..26909ea --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo2_chart/best_topology_skills.json @@ -0,0 +1,9 @@ +{ + "chart_internal": "CHART: emit a COMPLETE inline Vega-Lite spec (never a description). Title + subtitle; axis titles WITH units; set width to \"container\". Sort categories explicitly (usually descending). 
Put value labels on marks and a legend when there are multiple series; make the key insight obvious.", + "board_layout": "BOARD LAYOUT: lay the board top-to-bottom and state the direction; give it a title naming the subject AND its scope. Include EVERY element the rubric requires \u2014 keep each concise and sized so the whole board fits one screen without clipping (don't drop a required element to save space). Use real, specific values, never placeholders. For a multi-section board use panes: rows for a report read in order, a balanced 2x2 grid for a dashboard of peer tiles; title each pane.", + "artifact_note": "TASK\nYou generate a single valid Vega-Lite v6 JSON spec from a short brief with fields:\nJourney (chart archetype, e.g. ablation-study, training-curves, infra-cost-breakdown),\nUse case, Reader, and Goal. Output ONLY the JSON object (with \"$schema\":\n\"https://vega.github.io/schema/vega-lite/v6.json\"). No prose.\n\nThe rendered image is graded on four axes: comprehension, visual_quality, geometry,\nand a per-journey rubric. visual_quality, geometry, and rubric are usually already\nstrong \u2014 the consistent weak point is COMPREHENSION (the grader must understand the\nchart from the image alone). Optimize aggressively for comprehension while keeping\nevery rubric item as its own discrete, individually-rendered element.\n\nCORE PRINCIPLE\nEmit every required item as its OWN labeled, discrete visual element (own mark / header /\naxis title / data-point label / annotation), never buried in prose or only in a subtitle.\nThe grader scores pixels, so anything that must be \"present\" must be visibly rendered.\n\nCOMPREHENSION RULES (highest priority \u2014 this is where points are lost)\n- Make the chart self-explanatory at a glance. Every axis MUST have an explicit,\n human-readable title including units (e.g. 
\"Validation Top-1 Accuracy (%)\",\n \"Monthly Cost (USD)\", \"Training step\").\n- Directly label data values on the marks (text layer on each bar/point) so numbers\n are readable without a tooltip \u2014 tooltips are NOT rendered in a static image, so\n never rely on them for required info.\n- State direction-of-good explicitly where relevant (\"lower is better\", \"more valuable\").\n- Keep the encoding simple and unambiguous: prefer one clear primary mark, label the\n key/highlighted data point, and avoid clutter that obscures the main message.\n- Ensure scale domains frame the data so differences are visually obvious (don't let\n bars/lines collapse into a flat indistinguishable band).\n- Use a clear, descriptive title AND a subtitle that names the concrete entity (model,\n run id, dataset, region, time period) \u2014 but do NOT put required rubric data ONLY in\n the subtitle; also render it as discrete chart elements.\n\nRUBRIC ITEMS \u2014 render each as its own detectable element\n- titled: give the chart a strong, specific title (and subtitle). Make the title\n unambiguous about what is shown.\n- sorted_by_impact / sorted_by_cost: when comparing categories, sort the axis by the\n measured value (largest impact/cost first). Use \"sort\": \"-x\" (horizontal bars) or\n {\"field\": ..., \"order\": ...} so ordering is visually evident.\n- component_bars / per-item breakdown: render ONE discrete bar (or element) per item\n being compared, each clearly labeled with its category name AND its value. Do not\n merge components.\n- baselines/references: render reference lines (rule marks) AND a text annotation\n labeling them (e.g. \"Full model = 76.8%\", \"best val 21.7 @ 78k\").\n\nDOMAIN FACTS / CONVENTIONS BY JOURNEY\n- ablation-study: bar chart, one bar per ablated variant (\"\u2212 <Augmentation>\"), sorted\n by impact (most harmful removal first / lowest resulting metric first). 
Show a dashed\n reference rule for the full-model baseline with a labeled annotation. Label each bar\n with its value and delta. Frame y-domain tightly around the metric range so deltas\n are visible. Make clear that a lower bar = more valuable component.\n- training-curves: line chart of metric vs training step. Plot both train and\n validation series with a clear color legend. Mark and label the best checkpoint\n (point + text annotation, e.g. \"best val 21.7 @ 78k\") and a dashed rule at that step.\n Use units on axes (\"Training step\", \"Validation perplexity (lower is better)\").\n Show whether the gap between train/val widens (overfitting) so convergence is judgeable.\n- infra-cost-breakdown: horizontal bar chart, one bar per namespace/team, sorted by\n cost descending (\"sort\": \"-x\"). Label each bar with its dollar value ($,.0f). Axis\n title \"Monthly Cost (USD)\". Subtitle with cluster/region, period, and total cost.\n Highlight the largest cost contributor with a distinct color.\n\nGENERAL FORMATTING\n- \"width\": \"container\", explicit \"height\".\n- Use a text-mark layer to print values on every primary mark.\n- Use number formats with units ($,.0f for currency, .1f for percentages, ~s for large\n step counts).\n- Keep it valid JSON, single object, no commentary.", + "tail_training-curves": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'training-curves'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Separate train and validation curves are both plotted\n- X axis is the training step/epoch with a clear axis title\n- Y axis is the loss/metric with a clear axis title\n- The train/val divergence (overfitting gap) is visible and legible\n- The best checkpoint / early-stop point is marked (rule or point)\n- A legend distinguishes the train vs validation series\n- Chart has a descriptive title and subtitle naming the run\n\nApply the layout rules above. 
Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_confusion-matrix": "\u2014 PLAN AND BUILD THIS BOARD \u2014\nBoard: 'confusion-matrix' (output format: Vega-Lite)\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\n## TASK\nProduce a concise plain-text plan AND a complete, valid Vega-Lite v5 spec for a confusion matrix visualization. Invent concrete, realistic content: a specific named classifier model (with version), a specific named dataset (with test-set size and per-class counts), real class names drawn from the use case, and plausible cell counts that tell a coherent error-analysis story. Generic plans fail \u2014 every value, label, and structure must be specific.\n\n## RUBRIC \u2014 satisfy EVERY criterion\n- An N\u00d7N grid of actual (rows) vs predicted (columns) classes\n- Each cell shows its count or rate as a text label\n- Cells are color-encoded by value with a sequential scale\n- Both axes are labeled (actual vs predicted) with the class names\n- The correct-prediction diagonal is distinguishable from off-diagonal errors\n- A color legend/scale is present\n- Title names the classifier AND dataset\n\n## CONTENT REQUIREMENTS\n1. **Title + subtitle:** Title names the task and matrix size (e.g. \"Sentiment Classifier \u2014 Confusion Matrix (3-class)\"). Subtitle names the specific model + version and dataset + total n + per-class breakdown (e.g. \"Model: DistilBERT-sentiment v2.1 \u00b7 Dataset: AmazonReviews-test (n=3,000, 1,000/class)\").\n2. **Matrix data:** Rows = Actual, Columns = Predicted, same class ordering on both axes. Provide every one of the N\u00d7N cells as explicit data objects with fields `actual`, `predicted`, `count`, and a boolean `correct` (true when actual == predicted). 
Make the diagonal dominant but include realistic off-diagonal confusions that surface an insight (which classes are most confused, weakest class).\n3. **Insight callout:** In the plan, explicitly name the largest confusions and the weakest class by the actual numbers.\n\n## VEGA-LITE LAYOUT RULES (this exact structure scored 1.00)\n- `$schema` v5, `width: \"container\"`, fixed `height` (~320 for 3\u00d73).\n- `title` object with `text`, `subtitle`, `anchor: \"start\"`, `fontSize: 16`, `subtitleFontSize: 11`.\n- Single `data.values` array holding all N\u00d7N cells.\n- Top-level `encoding`:\n - `x`: field `predicted`, type nominal, title \"Predicted class\", `sort` listing class names in fixed order, `axis: {orient: \"top\", labelAngle: 0}`.\n - `y`: field `actual`, type nominal, title \"Actual class\", `sort` with the same class order.\n- Three `layer`s in this order:\n 1. `rect` mark with `color` encoding: field `count`, quantitative, `scale: {scheme: \"blues\"}` (sequential), `legend: {title: \"Count\"}`.\n 2. `rect` mark, `filled: false`, `strokeWidth: 3`, with `transform: [{filter: \"datum.correct == true\"}]` and `stroke: {value: \"#1a9850\"}` (green border marks the correct-prediction diagonal, making it distinguishable from off-diagonal errors).\n 3. `text` mark, `fontSize: 14`, `fontWeight: \"bold\"`, with `text` = field `count`, and conditional `color` (white when count exceeds a threshold ~half the max, else black) so labels stay legible on dark cells.\n\n## OUTPUT FORMAT\n1. A \"# BOARD PLAN\" section with: Layout (orientation, title, subtitle), Matrix structure (a markdown table of the actual\u00d7predicted counts), the key insight, and a \"Rubric mapping\" list showing how each rubric criterion is met.\n2. The full Vega-Lite JSON spec in a ```json code block.\n3. One closing line summarizing the visual encoding (border = correct diagonal; fill intensity = count; bold labels = exact values).\n\nBe concrete throughout. 
Use real-sounding model/dataset names and self-consistent numbers.", + "tail_ablation-study": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'ablation-study'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- One bar per ablated component/variant\n- The full/baseline model is shown as a reference bar or rule\n- The drop/delta from removing each component is clear\n- Y axis is the eval metric with a clear axis title\n- Variants are ordered by impact on the metric\n- Bars are labeled with their metric values\n- Title names the model and the metric being ablated\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_infra-cost-breakdown": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'infra-cost-breakdown'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A bar (or stacked bar) chart of cost by service/resource category\n- The cost axis has a title and currency units\n- Each category/service is clearly labeled\n- Categories are sorted by cost (largest first)\n- A total and/or per-bar value labels are shown\n- The top cost driver is visually highlighted\n- Title names the account/environment and the period\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo2_chart/report.md b/scripts/experiments/gepa-flowchart/overnight/topo2_chart/report.md new file mode 100644 index 0000000..d9c6f42 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo2_chart/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: training-curves, confusion-matrix, ablation-study, infra-cost-breakdown +shared skills: artifact_note, board_layout, chart_internal +shared skills CHANGED by GEPA: artifact_note + +**Seed 0.2808 -> Best 0.5827 (+0.3018)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo2_comparison/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo2_comparison/best_topology_skills.json new file mode 100644 index 0000000..3a14ccd --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo2_comparison/best_topology_skills.json @@ -0,0 +1,9 @@ +{ + "board_layout": "TASK: You are designing a single-screen \"board plan\" \u2014 a text spec for a data-visualization board that another system will render.\n\nINPUT FORMAT: Four fields:\n- Journey: board archetype (before-after, model-leaderboard, shopping-comparison, product-comparison). Suffix \"(component)\" = a single focused component.\n- Use case: the specific content/metrics to display.\n- Reader: the target audience.\n- Goal: the decision/insight the reader should walk away with.\n\nOUTPUT: A structured markdown board plan optimized for FOUR scored dimensions:\n- comprehension \u2014 clarity of THE insight at a glance. THIS IS ALWAYS THE WEAKEST DIMENSION (historically ~0.45\u20130.71) AND THE TOP PRIORITY. Boards that are data-complete but make the reader hunt for the verdict score low here.\n- visual_quality \u2014 historically 1.00 with the formatting below; keep it.\n- geometry \u2014 clean fit on ONE screen with NO clipping. 
BOTH wide tables AND too many stacked sections cause overflow.\n- rubric \u2014 every required element present/correct; historically 1.00. Keep it.\n\n====================================================================\nPRIORITY #1 \u2014 COMPREHENSION (spend the most effort here)\n====================================================================\nThe reader must learn the answer to the Goal in under 3 seconds, BEFORE reading any table.\n\nCRITICAL LESSON: Repeating the same winning number across a headline badge AND a verdict banner AND a hero metric does NOT raise comprehension \u2014 it adds clutter and height. Instead, make ONE element unmistakably the answer, and let the others add DIFFERENT supporting facts. The verdict must be singular, not echoed three times.\n\n1. VERDICT BANNER (mandatory, immediately under the title): ONE bold plain-language sentence answering the Goal \u2014 name the winner/result + the SINGLE most decision-relevant reason. This is the visually dominant element after the title. Examples:\n - \"\u2192 Pick DigitalOcean: same 2 vCPU/8 GB specs as AWS for $12/mo less (\u221220%).\"\n - \"\u2192 Optimization cut bundle 58% and load time 1.9 s \u2014 ship it.\"\n - \"\u2192 Canary-1B: most accurate (4.8% WER) AND real-time fast (RTF 0.08).\"\n\n2. ONE HERO METRIC: the single number that decides the Goal, the largest value on the board. Choose a DIFFERENT framing than the verbatim verdict sentence (e.g., verdict names winner+reason; hero shows the headline magnitude). Do NOT also add a separate \"headline badge\" that restates the same number \u2014 fold the badge INTO the hero or omit it to save vertical space.\n\n3. PLAIN-LANGUAGE LABELS: annotate \"good\" inline (\"lower = better\", \"best\", \"cheapest\", \"winner\") so direction is never inferred.\n\n4. PRE-COMPUTE EVERY COMPARISON: show deltas, %, \"X\u00d7 faster/slower\", \"$Y cheaper than average\" next to values. Insight readable, not derivable.\n\n5. 
SUPPRESS NOISE AGGRESSIVELY: cut any metric/column/row that doesn't help answer the Goal. For comprehension, fewer decision-relevant numbers always beat exhaustive ones. Also cut redundant prose: delta-vs-winner notes should be \u22643 short bullets, not one per row.\n\n====================================================================\nPRIORITY #2 \u2014 GEOMETRY (no clipping; controls BOTH width AND height)\n====================================================================\n- Tables: max ~6\u20137 columns AND ~6 rows. Wide (9-col) tables clip \u2014 split/trim/summarize.\n- HEIGHT IS ALSO A CLIPPING RISK (this caused a real overflow on a tiny 4-row board). Keep total section count low. Target this lean stack for before-after:\n Title (with inline scope subtitle) \u2192 Verdict banner (with hero metric folded in) \u2192 two cards \u2192 footer.\n Do NOT add a separate headline badge line AND a verdict line AND a standalone hero block AND per-row delta bullets \u2014 that stacks too tall. Merge the verdict + hero into one compact dominant block.\n- Keep every cell terse (value + short flag, not a sentence). Prefer fewer denser rows.\n\n====================================================================\nCORE RULES\n====================================================================\n1. LAYOUT: state orientation explicitly (\"Top-to-bottom flow\", \"read left\u2192right\"). TITLE must name SUBJECT + SCOPE with real qualifying details: version, dataset, hardware, date, conditions (e.g. \"COCO val2017 mAP vs FPS (NVIDIA T4, TensorRT FP16, 640\u00d7640, batch=1, Mar 2025)\", \"Lighthouse 11 mobile, Slow 4G, median of 5 runs, Mar 2025\").\n2. COMPLETENESS: include EVERY rubric-required element, concise.\n3. REAL VALUES ONLY: specific, plausible, internally-consistent numbers; never placeholders. Verify ALL derived values (deltas, %, best/lowest flags) arithmetically \u2014 a wrong \"best\" flag costs rubric points. Real brand/product/model names.\n4. 
STRUCTURE BY TYPE:\n - Comparison/leaderboard/shopping = equal-width cards side by side, IDENTICAL aligned rows in same order (compare horizontally).\n - Report = stacked rows in order.\n - Dashboard = balanced 2\u00d72 grid.\n - Title every pane/section.\n\n====================================================================\nARCHETYPE-SPECIFIC GUIDANCE\n====================================================================\nbefore-after:\n- Two equal-width cards: LEFT=\"BEFORE\" (grey header + baseline/version subtitle); RIGHT=\"AFTER\" (green header + version subtitle).\n- Same metrics, same order, both cards (row alignment).\n- AFTER card gets a \u0394 column: arrow + sign + absolute value + percent.\n- \u25bc where lower=better (latency, error rate, memory, bundle size, requests, load time, cost, idle count); \u25b2 where higher=better (throughput, Lighthouse/score). GREEN = improvement, RED = regression.\n- A grey\u2192green headline shift is required by rubric, BUT keep it to ONE compact line and fold the hero number into it OR into the verdict \u2014 avoid three separate echo lines (this kept comprehension low and risked height overflow).\n- VERDICT BANNER answering the Goal in plain words is mandatory.\n- Footer: \"Deltas = After \u2212 Before; % relative to Before; same hardware/network; median of N runs.\"\n\nmodel-leaderboard:\n- Each model = one clearly separated ROW (common rubric weakness \u2014 keep rows complete/distinct). Metrics = aligned columns.\n- Rank rows best\u2192worst by the primary balance metric.\n- COLUMNS \u2264 ~6: typically Rank, Model, primary accuracy metric, speed metric, Params/FLOPs (pick more relevant), Score. Fold extras into an averaged metric or notes.\n- Flag per-metric best cell with \ud83c\udfc6. 
Put \ud83c\udfc5 WINNER badge (green) on the best row with a one-line justification.\n- Show delta-vs-winner notes (e.g., \"+0.4pp WER, 2\u00d7 faster\") but cap at \u22643 bullets.\n- Footer: define Score formula + sources + measurement conditions.\n\nshopping-comparison / product-comparison:\n- 3+ equal-width cards, identical row order: logo/image tile \u2192 product/plan name (+ badge) \u2192 prominent LARGE price \u2192 4\u20135 aligned spec rows \u2192 star rating + review count \u2192 outbound link button with real domain.\n- Flag best with green \"BEST VALUE\" badge AND show its price delta vs peer average (\u25bc \u2212$X / \u2212Y%).\n- VERDICT BANNER must state which to choose and why (e.g., \"same specs, $X cheaper\").\n\nFINISH WITH: A brief rubric-satisfaction checklist confirming each required element is present, including verbatim arithmetic checks of every delta/percentage. Keep the checklist compact (it counts toward total height).", + "comparison_grid": "COMPARISON LAYOUT: equal-width option cards side by side with IDENTICAL aligned rows; add a delta column (arrow + sign + value + %) where it applies; flag the best/recommended option with a badge (tone grey\u2192green). For products, show an image and an outbound link per option. Limit to the 3-4 most relevant options and keep each card compact (small image, ~4-5 spec rows) so the row fits on one screen without horizontal overflow.", + "artifact_note": "TASK\nYou generate a single UI component as a JSON tree (Mantine-style components) that renders into an image. A grader scores the RENDERED IMAGE on four axes: comprehension, visual_quality, geometry, and rubric. Your job is to maximize all four.\n\nINPUT FORMAT\nYou receive a short spec:\n- Journey: the component archetype (e.g. 
shopping-comparison, model-leaderboard)\n- Use case: the specific content and which items/columns must appear, including what to flag (cheapest, winner, best-per-metric, etc.)\n- Reader: the target audience\n- Goal: the decision the reader needs to make\n\nOUTPUT FORMAT\nEmit ONLY a valid JSON object describing the component tree. Every node has \"type\", \"props\", and (optionally) \"children\". Use real Mantine components: Stack, Group, SimpleGrid, Card, Title, Text, Badge, Alert, Table, List/List.Item, Image, Anchor, Divider, Checklist. children is either a string or an array of nodes.\n\nCORE PRINCIPLE\nEmit every required item as its OWN labeled, discrete element (own header / structured field / table cell / badge / inline spec) \u2014 never bury required facts in prose paragraphs. The grader scores what is visibly, distinctly rendered.\n\nHARD-WON LESSONS FROM PAST RUNS (apply all):\n\n1. GEOMETRY IS THE #1 LOSS. Every past board lost points to \"content overflows/clips.\" You MUST budget space aggressively:\n - Do NOT pack a 3-column SimpleGrid of detail cards AND a full comparison table AND a hero card AND an alert AND a guidance list all at once. Pick ONE primary presentation of the per-item data (either per-item cards OR one comparison table \u2014 not both) to avoid overflow.\n - Keep titles SHORT. Move conditions/assumptions (date, units, methodology) into a small dimmed footnote Text, not the Title. Long titles wrap and clip.\n - Inside SimpleGrid cards keep content compact: few short lines, avoid nested Tables inside narrow grid columns (they overflow horizontally).\n - Prefer one wide Table over many narrow cards when there are \u22653 items with \u22654 attributes each \u2014 tables fit the most data without clipping.\n - Drop decorative/redundant blocks (extra hero cards, duplicate winner cards, Checklists) that consume vertical space without adding required content.\n\n2. COMPREHENSION CAN TANK EVEN WITH A CLEAN RUBRIC (one board hit comp 0.50). 
To keep comprehension high:\n - Lead with a single clear verdict/recommendation tied directly to the Goal, in an Alert with a \"\u2192\" directive.\n - Rank items explicitly by the primary decision metric and show that ordering visibly.\n - Make the flagged/winning item unambiguous and consistent across the whole layout (don't flag two different \"best\" items, and don't let hero/alert/cards disagree on the winner).\n - State deltas in plain terms (e.g. \"\u2212$20 / \u221214% vs avg\") next to the relevant value.\n\n3. RUBRIC SPECIFICS BY ARCHETYPE:\n - shopping-comparison: rubric repeatedly lost on product_image 0.5. Placeholder images (placehold.co) score poorly. Provide a REAL, plausible product image src when possible, with a descriptive alt. Each product must show: name, price, weight, cushioning, drop, rating(+count), a buy link, and the cheapest clearly flagged. Show average price and per-item delta vs average.\n - model-leaderboard: each model = its own table row; eval metrics = columns; flag best-per-metric (\ud83c\udfc6); include params + latency/RTF per model; rank by the primary metric; declare an overall winner; include an aggregate score column; define metrics/scoring in a footnote. (This rubric already scored 1.00 \u2014 keep this structure but trim blocks to fix geometry.)\n\n4. 
GENERAL STRATEGY:\n - Structure: short Title \u2192 verdict Alert \u2192 ONE primary data presentation (table or compact cards) \u2192 brief plain-language guidance List \u2192 dimmed footnote with units/sources/methodology.\n - Every required attribute from the Use case must appear as a discrete labeled field/cell, with units.\n - Flag superlatives (cheapest, lightest, highest-rated, fastest, winner) inline with badges or \u2713/\u25bc markers.\n - Keep total element count modest to guarantee everything fits without clipping \u2014 completeness that overflows scores worse than a slightly leaner layout that renders fully.\n\nOutput the JSON only, no commentary.", + "tail_shopping-comparison": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'shopping-comparison'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each shopping option is its own card in a comparison grid\n- Each option shows a prominent product image\n- Each option shows its price\n- The cheapest (or best-value) option is flagged with a badge\n- Each option lists key specs/features for comparison\n- Each option has an outbound buy/visit link\n- Each option shows a rating or review score\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_product-comparison": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'product-comparison'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are planning a visual comparison board made of side-by-side option cards in a grid. Output a concise plain-text plan naming the SPECIFIC content (real product names, real prices, real ratings, real features, real outbound URLs) that satisfies each rubric criterion. 
Generic plans fail \u2014 use concrete, realistic values.\n\nRUBRIC \u2014 satisfy EVERY criterion, explicitly, for EVERY option:\n- Each option is its own card in a grid.\n- Each option shows a PROMINENT product image/logo \u2014 describe it as a large, full-card-width image tile (an actual product photo or brand logo). Do NOT describe it as a single letter, initial, monogram, placeholder, or small icon. This is the most-failed criterion: make every image clearly a real, large product photo (e.g. \"Large full-card-width hero photo of the MacBook Air M3 (Midnight, lid open)\").\n- Each option shows its price (concrete currency value).\n- Each option shows a star rating or score badge (e.g. \u2605\u2605\u2605\u2605\u2606 4.4/5 with review count).\n- Each option lists its key features (use IDENTICAL, aligned feature rows across all cards in the same order so values compare horizontally).\n- Each option has a clickable outbound 'Visit site \u2192' link with a real domain.\n- A standout/recommended option is flagged with a badge (e.g. 'Popular', 'Best Value').\n\nLAYOUT RULES (critical \u2014 geometry is scored, and content overflow/clipping loses points):\n- Lay out options as equal-width cards in a single row grid, read left\u2192right.\n- KEEP CONTENT WITHIN CARD BOUNDS. Avoid overflow/clipping: limit each card to ~4 short feature rows, keep labels terse, and do not pack extra annotations (delta vs average, percentage comparisons, verification notes, long subtitles) that bloat the card and cause clipping. Prefer brevity over completeness.\n- Within each card, order top-to-bottom: large image tile \u2192 product name (+ badge) \u2192 price \u2192 rating \u2192 key feature rows \u2192 'Visit site \u2192' button.\n- Rank/order cards meaningfully (e.g. by value or recommendation), and flag exactly one primary recommended card with a prominent badge.\n\nOUTPUT STRUCTURE:\n1. TITLE \u2014 a clear comparison title with scope (region/currency, date/config basis) kept short. 
Example: \"Best Laptops for Everyday Work \u2014 US prices, base configs, Mar 2025\".\n2. LAYOUT \u2014 orientation, number of cards, ranking rationale, and the shared row order used in every card.\n3. (Optional) A one-line VERDICT BANNER under the title that directly answers the Goal at a glance, naming the recommended pick, its price, and the single strongest reason.\n4. One block PER CARD with: image (described as large real photo), name, badge if any, price, rating, key features (aligned rows), 'Visit site \u2192' link.\n5. A short rubric checklist confirming each criterion is met.\n\nSTRATEGY THAT SCORES WELL:\n- Use real, well-known products with realistic specs, prices, ratings, review counts, and real domains.\n- Give EVERY card a badge but only ONE primary recommended badge (e.g. \ud83c\udfc6 Best Value); the others can be descriptive (e.g. \"Budget Pick\", \"Business Pick\").\n- Keep feature rows identical and aligned across all cards (same labels, same order) so values compare horizontally. Pick ~4 comparison-relevant attributes for the product category.\n- Make the image description unambiguously a large full-card-width real product photo, including color/configuration and state (e.g. \"lid open\"), to avoid the monogram/placeholder failure.\n- Order cards meaningfully (e.g. 
by value), and explain the ordering in the LAYOUT section.\n\nEXAMPLE (laptop comparison, three cards) \u2014 illustrates feature-row alignment, badges, and image descriptions:\n- Card 1: Dell XPS 13 (9340), badge \"Budget Pick\", $999, \u2605\u2605\u2605\u2605\u2606 4.3/5 (1,420 reviews); rows \u2014 CPU: Intel Core Ultra 5 125H / RAM-Storage: 16 GB / 512 GB SSD / Display: 13.4\" FHD+ 60 Hz / Battery: ~12 h; Visit site \u2192 dell.com.\n- Card 2: MacBook Air 13\" (M3), badge \"\ud83c\udfc6 Best Value\" (recommended), $1,099, \u2605\u2605\u2605\u2605\u2605 4.7/5 (3,860 reviews); rows \u2014 CPU: Apple M3 (8-core) / RAM-Storage: 16 GB / 256 GB SSD / Display: 13.6\" Liquid Retina / Battery: ~18 h; Visit site \u2192 apple.com.\n- Card 3: ThinkPad X1 Carbon G12, badge \"Business Pick\", $1,499, \u2605\u2605\u2605\u2605\u2606 4.5/5 (920 reviews); rows \u2014 CPU: Intel Core Ultra 7 155U / RAM-Storage: 16 GB / 512 GB SSD / Display: 14\" WUXGA 400-nit / Battery: ~15 h; Visit site \u2192 lenovo.com.\n\nKeep the whole plan concise. Concrete real values beat exhaustive detail. Do not let any card's content exceed what fits cleanly in a grid cell.", + "tail_model-leaderboard": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'model-leaderboard'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each model is its own row in a comparison table\n- Multiple eval metrics are shown as columns\n- The best value per metric (or the winning model) is flagged with a badge\n- Model size/params or cost/latency is shown for each model\n- Models are ranked/sorted by a primary metric\n- An overall winner or aggregate score is indicated\n- Title states the benchmark/dataset the models were evaluated on\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_before-after": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'before-after'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A two-column layout shows a Before card and an After card side by side\n- Each card contains a table of the same metrics for comparison\n- The After card includes a delta (\u0394) column showing the change\n- A headline badge shifts color (e.g. grey \u2192 green) to signal improvement\n- The cards are clearly labeled 'Before' and 'After'\n- Metric rows align across both cards so values are directly comparable\n- The direction of improvement is visually clear (color or sign on deltas)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo2_comparison/report.md b/scripts/experiments/gepa-flowchart/overnight/topo2_comparison/report.md new file mode 100644 index 0000000..901d111 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo2_comparison/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: shopping-comparison, product-comparison, model-leaderboard, before-after +shared skills: artifact_note, board_layout, comparison_grid +shared skills CHANGED by GEPA: board_layout + +**Seed 0.8796 -> Best 0.9311 (+0.0515)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo2_dashboard/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo2_dashboard/best_topology_skills.json new file mode 100644 index 0000000..f8135f6 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo2_dashboard/best_topology_skills.json @@ -0,0 +1,9 @@ +{ + "board_layout": "You are designing a BOARD PLAN: a layout specification for a dashboard or report 
visualization. \n\nINPUT FORMAT: You receive a Journey (board type + form factor: \"component\" = single-focus board, \"panes\" = multi-section board), a Use case (what the board shows), a Reader (the audience), and a Goal (what they need to accomplish).\n\nYOUR #1 PRIORITY IS READER COMPREHENSION. The board must be instantly understandable by the stated Reader to achieve the stated Goal. A beautiful, complete board that the reader can't quickly parse is a failure. Specifically:\n- Lead with the answer: the single most important number/status the reader needs should be the largest, most prominent element. The reader should grasp the headline state in under 3 seconds.\n- Write for the reader's mental model, not the system's. Translate internal jargon, codes, and raw metrics into plain meaning. Always pair any code/ID/technical value with a short human-readable explanation of what it means and whether it's good or bad.\n- Every number must carry its interpretation: include the threshold/target it's compared against and an explicit good/bad/neutral signal (color + direction). Never make the reader do math or guess if a value is healthy.\n- Cut anything that doesn't serve the Goal. Prefer fewer, clearer elements over exhaustive detail. Density is the enemy of comprehension.\n- For data tables and panels (e.g. error tables, top-N lists), make each row self-explanatory: clear column headers, a description column, and a sortable/ranked primary metric. Highlight the rows that demand action.\n\nLAYOUT RULES:\n- Lay the board top-to-bottom and state the direction explicitly.\n- Give it a title naming the subject AND its scope (time window, region, version \u2014 e.g. \"last 5 min, US-East region\").\n- For a multi-section board, use panes: rows for a report read in order; a balanced 2\u00d72 grid for a dashboard of peer tiles. Title each pane.\n\nFIT THE SCREEN \u2014 DO NOT CLIP (geometry is scored):\n- The whole board must fit one screen without overflow or clipping. 
This is a hard constraint.\n- Budget your space before adding content. If you have many elements, make each more concise rather than letting the board grow past one screen.\n- Keep timelines, checklists, and tables short (cap long lists to the top ~5 most important items; summarize the rest in one line if needed).\n- Keep text in each element brief \u2014 short phrases, not sentences. Trim parenthetical asides and explanatory footnotes that add length.\n- A clipped board scores worse than a slightly less detailed one. When in doubt, shorten.\n\nRUBRIC COMPLETENESS:\n- Include EVERY element the use case / rubric requires (e.g. SLO ring + badge, latency chart with p50/p95/p99, errors table, throughput chart, IC, timeline, action items, stat cards). Do not drop a required element to save space \u2014 instead make each element more concise.\n- Use real, specific, realistic values \u2014 never placeholders.\n- Every chart must have a title and named X and Y axes; state chart type and use \"width\": \"container\" so it fits its card.\n- Stat cards: dimmed label on top, large bold value in center, threshold-tied polarity badge (color + arrow).\n- Mix component types where required (some stat/ring/table components AND some charts).\n\nOUTPUT: A clear, structured board plan. After the layout, optionally include a brief color key and a one-line rubric-coverage check, but keep these minimal \u2014 they should not crowd the board.", + "dashboard_grid": "DASHBOARD LAYOUT: a balanced grid of peer tiles \u2014 include every panel the rubric names. Use stat cards (dim label / large value) with a polarity badge tied to a named threshold; set embedded charts' width to \"container\". Keep each tile concise; never drop a required panel to save space.", + "artifact_note": "TASK\nYou generate the structured spec for a single rendered \"board\" (an information visualization) from a small brief. 
Each input gives you:\n- Journey + a rendering mode in parentheses: one of\n \u2022 (panes) \u2192 output a grid of panes: {\"layout\":\"grid\",\"panes\":[{title,type,content}]} where type \u2208 \"markdown\" | \"vegalite\"\n \u2022 (flow) \u2192 output a node/edge graph: {\"direction\",\"legend\":[{label,color}],\"nodes\":[{id,type,width,height,data:{label,status,sub,icon?,spark?}}],\"edges\":[{source,target,data?:{label},style,markerEnd}]}\n \u2022 (component) \u2192 output a nested UI component tree (Mantine-style: Stack/SimpleGrid/Card/Title/Text/Badge/VegaLite with props + children)\n- Use case: the concrete things that MUST each appear (e.g. \"SLO ring, p50/p95/p99 chart, top-errors table, throughput chart\").\n- Reader + Goal: who scans it and what they must learn at a glance.\n\nOutput ONLY the JSON/spec object for the requested mode. Invent realistic, internally-consistent sample data.\n\nCORE RULE\nEmit every required item from the Use case as its OWN labeled, discrete element (own pane / own node / own card / own header / own chart series / own table row) \u2014 never bury required facts in prose. The grader scores the RENDERED IMAGE, so anything that must be \"read\" must be a visible, separated, fully-on-canvas element.\n\nThe grader rewards four things; optimize all of them:\n1. comprehension (HIGHEST-WEIGHT, and the weakest in past runs ~0.3\u20130.5) \u2014 see COMPREHENSION below.\n2. visual_quality\n3. geometry \u2014 NOTHING may overflow, clip, or fall off-canvas. This has repeatedly cost points.\n4. rubric \u2014 every required item present; all values readable.\n\nCOMPREHENSION (most important, currently failing)\nThe reader must instantly extract the Goal. So:\n- Lead with a single headline verdict element: one short, explicit status line (e.g. \"CHECKOUT STATUS: DEGRADED \u2014 root cause: Payments DB DOWN\", or \"All metrics on target except churn\"). Make the bottom-line answer impossible to miss.\n- Reduce density. Fewer words per element. 
Comprehension dropped when elements were stuffed with metrics + targets + sparklines + emojis all at once. Keep each element to: name, the one key value, one comparison/status.\n- Make the single most important element visually dominant (size/color), and de-emphasize the rest.\n- Don't rely on legends/emoji-keys to carry meaning; state status in words on the element itself.\n- Keep targets/secondary detail brief and secondary, not competing with the primary number.\n\nGEOMETRY (must be clean \u2014 past boards lost points to overflow & off-canvas nodes)\n- Panes mode: keep pane content from overflowing. Trim markdown so tables/text fit; for charts use \"width\":\"container\" and a conservative \"height\" (\u2248180\u2013220). Do not pack a pane with a long table AND extra notes that will clip.\n- Flow mode: lay nodes out so ALL of them stay on canvas. Past failures had 3 off-canvas nodes. Account for node width/height when placing; don't create rows wider than the canvas. Keep total horizontal span modest; prefer fewer columns and vertical stacking. Banner/legend full-width nodes must not exceed the layout width spanned by the node row beneath them. Be conservative with node count and sizes.\n- Component mode: SimpleGrid cols should match item count without overflow; keep card text short so values aren't truncated.\n\nRUBRIC / READABILITY\n- Include every item named in the Use case, each as its own discrete element.\n- values_readable was dinged: ensure numbers are large, uncropped, and not clipped by container bounds. Don't let long badge/label strings overflow their card.\n- Use clear units and a brief comparison (vs target / MoM / direction arrow) on each metric.\n\nDATA & STYLING CONVENTIONS (reuse these patterns)\n- Status color scheme: green #22c55e = healthy/on-target; amber/yellow #f59e0b/#eab308 = degraded/near-target; red #ef4444 = down/off-target. 
Mirror with \ud83d\udfe2/\ud83d\udfe1/\ud83d\udfe0/\ud83d\udd34 sparingly.\n- Charts (VegaLite): use threshold/target lines via a \"rule\" mark with strokeDash [6,4]; color multi-series by category with the status palette; reverse the x-axis for \"minutes ago\" time series; use area+line for throughput, multi-line for percentiles.\n- Tables (markdown): bold the row that breaches its target; include a target note; keep to the few rows that matter.\n- Always state the target alongside each metric so status is justified, but keep it compact.\n\nCHECKLIST BEFORE RETURNING\n\u25a1 One dominant headline verdict answering the Goal.\n\u25a1 Every Use-case item is its own labeled element.\n\u25a1 Each element is low-density: name + key value + one status, not a wall of stats.\n\u25a1 All elements fit on canvas; nothing overflows, clips, or goes off-canvas (check node sizes vs layout width; chart heights vs pane).\n\u25a1 Numbers large and fully readable; units + comparison present.\n\u25a1 Output is only the spec object for the requested mode.", + "tail_metrics-dashboard": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'metrics-dashboard'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are planning a single-component metrics dashboard board. Produce a concise plain-text plan that names SPECIFIC, concrete content (real numbers, real labels, real structure). 
Generic plans fail the rubric.\n\nSTRICT RUBRIC \u2014 satisfy EVERY criterion:\n- A row of stat cards each shows a dimmed label and a large value\n- Stat cards carry status badges (color-coded) where relevant\n- An embedded trend chart (VegaLite) is present inside a card\n- The dashboard has a clear title heading\n- The embedded chart fits its card width (not bleeding outside)\n- Content is arranged in a clean top-down stack (title, stats row, chart)\n- All stat values and labels are legible and aligned\n\nLAYOUT RULES:\n- Use a strict top-to-bottom stack: (1) Title heading, (2) Stats row, (3) Trend chart card.\n- The stats row holds 3\u20134 equal-width peer cards with aligned baselines.\n- The chart card is full width and contains a VegaLite spec using \"width\": \"container\" so it never bleeds outside the card.\n\nCONTENT GUIDANCE (be concrete):\n- Title: name the entity/product, the time period, and a short subtitle stating what the dashboard shows (e.g. \"Business Metrics \u2014 Acme SaaS, March 2024 (MoM)\" with subtitle \"Headline KPIs at a glance \u00b7 updated daily\").\n- Each stat card needs three parts:\n - Dimmed label in ALL CAPS naming the metric and its window (e.g. \"MONTHLY RECURRING REVENUE\", \"ACTIVE USERS (30-day)\").\n - Large bold value with the actual number/unit (e.g. \"$48,200\", \"1,840\", \"4.1%\", \"22%\").\n - Color-coded status badge with direction arrow, delta, and context vs target (e.g. \"\ud83d\udfe2 \u25b2 +12% vs Feb \u00b7 target $45K met\", \"\ud83d\udd34 \u25b2 +0.6pt \u00b7 above ceiling (bad)\", \"\ud83d\udfe1 \u25bc \u22121pt \u00b7 below target (watch)\").\n- Color key convention: \ud83d\udfe2 healthy/on-target \u00b7 \ud83d\udfe1 watch/below target \u00b7 \ud83d\udd34 breach/bad. Note that for some metrics (like churn) an increase is bad, so pick the badge color by whether the trend is GOOD, not by arrow direction.\n- Trend chart card: give it a title (e.g. 
\"MRR Trend \u2014 last 6 months\"), specify VegaLite line chart with \"width\": \"container\", name the X axis (months with real labels), Y axis (metric + unit), the actual data series values, an optional dashed target reference line, and a highlighted last point.\n\nIMPROVING COMPREHENSION (most important \u2014 prior plans were clear visually but weak on comprehension):\n- Make the plan readable as a story a reader can follow without seeing the board. Explicitly tie each metric back to the reader's goal \u2014 state what each number TELLS the reader and why it matters for the stated purpose.\n- For each stat, add a one-line plain interpretation (e.g. \"Revenue is accelerating and now clears the $45K target\"). Don't just list numbers \u2014 explain them.\n- Ensure the chosen metrics directly match the use case and that the chart visualizes the headline metric the reader most cares about.\n- Keep wording plain, specific, and self-explanatory; avoid jargon the stated reader wouldn't know.\n\nOUTPUT FORMAT:\n- Start with a one-line layout direction.\n- Then sections: TITLE, STATS ROW (one block per card with Label / Value / Badge / Interpretation), TREND CHART CARD (with spec details).\n- End with a color key and a short rubric checklist confirming each criterion is met.\n\nOutput only the plan in concise plain text.", + "tail_service-health": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'service-health'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are planning a \"service-health\" board: a dependency map of service tiles, each colored by health, connected by edges in dependency order. Output a concise plain-text plan naming SPECIFIC content (real values, labels, structure). Generic plans fail.\n\n== WHAT THE BOARD IS ==\nA health map showing each named service as a tile, arranged left-to-right (or top-to-bottom) in dependency order, with edges connecting dependent services. 
A headline states overall status and the answer to the reader's Goal; a legend maps colors to health states.\n\n== STRICT RUBRIC \u2014 satisfy EVERY criterion explicitly ==\n1. HEALTH COLOR per tile: healthy=green, degraded=amber, down=red. State the border/header color for every tile.\n2. INLINE SPARKLINE per tile: name the EXACT metric the sparkline plots AND its shape/trend. Be specific and varied \u2014 e.g. \"p95 latency line, steady ~210ms (green)\", \"error-rate line spiking 1%\u219271% (red)\", \"job-queue depth rising sharply (amber)\", \"latency flatlined / no-data (red)\". Every tile needs one; do not reuse a vague \"trend line.\"\n3. SLI VALUE in a meta line: a concrete current value with units and target, e.g. \"p95 214ms \u2713 (target <300ms)\", \"Error rate 71% \u25b2 (target <1%)\", \"Availability 0% (target 99.9%)\".\n4. ROLE/PURPOSE sub line: one concrete phrase describing what the service does, e.g. \"Front door \u2014 routes all search requests\", \"Encodes ABR renditions for streams\".\n5. DEPENDENCY EDGES: connect services in dependency order. CRITICAL \u2014 color each edge to match the HEALTH OF THE TARGET (the service the arrow points TO), not the origin. Never color an edge by the origin's health. State each edge and its color explicitly with the rule applied, e.g. \"Playback API \u2192 Transcoder: amber (target Transcoder degraded)\". Get this exactly right for every edge.\n6. LEGEND: maps the three colors to healthy/degraded/down (\ud83d\udfe2 Healthy \u00b7 \ud83d\udfe1 Degraded \u00b7 \ud83d\udd34 Down).\n7. GEOMETRY: no tiles overlap; edges route in clear lanes and never pass through unrelated tiles. ALL tiles and edges MUST stay on-canvas \u2014 keep the layout compact (a single row of 4\u20135 tiles can run off-canvas; if many services, wrap or stagger into a bounded grid rather than one long line). 
Explicitly plan positions so nothing goes off-canvas.\n\n== STRATEGY FOR HIGH COMPREHENSION (most important \u2014 comp is weighted heavily) ==\n- Lead with a HEADLINE BANNER that directly ANSWERS THE READER'S GOAL in plain words, not just an overall status. If the goal is to confirm a root cause, name the root cause in the headline (e.g. \"SEARCH DEGRADED \u2014 Index DOWN is root cause\"). If the goal is to pinpoint a component, name that component and why.\n- Make the problem tile unmistakable: mark it with a tag like \"ROOT\" or \"ACTION\", a thicker/glowing border, and put its breaching numbers front and center.\n- Keep all SLI values plausible and internally consistent with the stated health (down \u2192 0%/no-data/huge errors; degraded \u2192 breaching warning threshold; healthy \u2192 within target).\n- Add a short summary line that explains the causal chain (e.g. \"Backlog \u2192 slow renditions \u2192 playback complaints\") so the reader can act.\n- Prefer fewer, clearer elements over crowding. Crowded one-line flows hurt both comprehension and geometry.\n\n== OUTPUT FORMAT ==\nPlain text. Suggested structure:\n- Title (service + environment + time window, e.g. \"...\u2014 Production, US-East, last 5 min\")\n- Layout direction + overall flow as an ASCII sketch showing tiles and colored edges\n- Headline banner (answers the Goal)\n- Legend strip\n- One block per tile: name + color, role sub line, SLI meta line, sparkline (metric + trend)\n- Explicit edge list with target-health color for each\n- Short pinpoint/causal summary\n- A rubric self-check confirming each criterion is met\n\nBe concrete with real values, labels, and structure. Keep it compact and on-canvas.", + "tail_observability-dashboard": "\u2014 PLAN AN OBSERVABILITY DASHBOARD BOARD \u2014\n\nBoard: 'observability-dashboard'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nTASK\nYou produce a concise plain-text plan for a mission-control observability board. 
The plan names the SPECIFIC content (real values, labels, structure) for each tile. Generic plans fail \u2014 every metric, axis, threshold, and table row must contain concrete sample data.\n\nSTRICT RUBRIC \u2014 satisfy EVERY criterion:\n- Panes are arranged as an independent-tile grid: a 2\u00d72 mission-control board of four peer tiles.\n- An SLO panel with ring progress and a status badge is present (a COMPONENT panel, not a chart).\n- A latency chart pane with percentiles (p50/p95/p99) is present.\n- A top-errors table and/or alert pane is present.\n- A throughput chart pane is present.\n- The grid MIXES component panels (ring/badge, table) and chart panels (line/area).\n- Each chart pane has a title AND named X and Y axis titles.\n\nLAYOUT (fixed 2\u00d72):\n- TILE 1 (top-left): SLO Panel [component: ring + badge]\n- TILE 2 (top-right): Latency Percentiles [chart: multi-line]\n- TILE 3 (bottom-left): Top Errors + Alert [component: table]\n- TILE 4 (bottom-right): Throughput [chart: area/bar]\nTiles read left-to-right, top-to-bottom.\n\nCRITICAL \u2014 AVOID GEOMETRY OVERFLOW (this is the main scoring problem):\nThe two prior attempts scored ~0.72\u20130.76 ONLY because content overflowed/clipped its tiles and comprehension dropped. Keep every tile compact so nothing clips:\n- Error table: MAX 4 rows + a short footer. Keep column count to 4\u20135 and cell text very short (e.g. status code + 2\u20133 word meaning, not full sentences).\n- SLO tile: at most 3 stat lines below the ring (availability, error budget, burn rate). No long sub-sentences.\n- Charts: 3 latency series max; 1\u20132 throughput series max. One short caption line only.\n- Avoid long explanatory captions and stacked sub-labels. 
Prefer terse labels.\n- Do not add extra rows, extra metrics, or paragraphs \"to be thorough\" \u2014 density causes clipping and lowers comprehension.\n\nCRITICAL \u2014 MAXIMIZE COMPREHENSION:\n- Make each tile instantly readable: clear title, one dominant value, clear color status.\n- Use a consistent color key: green = healthy/within target, amber = at risk, red = breached threshold. Use \u25b2/\u25bc for direction vs target.\n- Tie tiles together causally where natural (e.g. throughput dip aligns with the 503 spike timestamp) \u2014 but in ONE short caption, not extra panels.\n\nCONTENT GUIDANCE (use realistic domain values):\n- SLO Panel: ring shows availability % (e.g. 99.82%) against a target line (e.g. 99.90%); badge text like \"AT RISK\" amber or \"HEALTHY\" green; include error-budget remaining % and burn rate (\u00d7). Burn rate >1\u00d7 = budget draining.\n- Latency chart: title naming p50/p95/p99 and unit (ms); X axis = time window (e.g. \"Time (last 30 min)\"); Y axis = \"Latency (ms)\"; give concrete values per percentile; draw a dashed SLO ceiling threshold and note if breached.\n- Top-Errors table: an alert banner (red) for the active condition (e.g. 
\"5xx on /checkout/pay > 1% threshold\"), then a table ranked by count with columns like Status | Meaning | Endpoint | Count | Trend; realistic HTTP codes (503, 500, 429, 404, 401); highlight action-needed rows red.\n- Throughput chart: title with unit (req/s or orders/min); X axis = time window; Y axis = rate unit; current value, a normal band or capacity line, optional error-rate overlay.\n\nOUTPUT FORMAT:\n- Plain text plan, organized by the four tiles in order.\n- Start with a board title (service, environment, region, time window).\n- State the layout direction and 2\u00d72 grid explicitly.\n- For each tile: label its type (component vs chart), title, and concrete content.\n- End with a one-line color key and a brief rubric check confirming each criterion.\n- Keep it concise \u2014 favor terse bullet labels over prose.", + "tail_sre-incident": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'sre-incident'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A header states the incident severity, status, and a one-line summary\n- A timeline of incident events (detect, escalate, mitigate, resolve) is shown\n- Impact metrics (affected users, error rate, duration) are shown as stats\n- Severity/status is color-coded (e.g. red for active, green for resolved)\n- Follow-up action items or next steps are listed\n- An incident commander/owner is named\n- The page has a clear incident title\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo2_dashboard/report.md b/scripts/experiments/gepa-flowchart/overnight/topo2_dashboard/report.md new file mode 100644 index 0000000..dd19062 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo2_dashboard/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: metrics-dashboard, service-health, observability-dashboard, sre-incident +shared skills: artifact_note, board_layout, dashboard_grid +shared skills CHANGED by GEPA: (none) + +**Seed 0.7116 -> Best 0.8408 (+0.1292)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo2_graph/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo2_graph/best_topology_skills.json new file mode 100644 index 0000000..7faf63c --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo2_graph/best_topology_skills.json @@ -0,0 +1,11 @@ +{ + "graph_process_spine": "GRAPH LAYOUT (process flow): keep the happy path on one straight top-to-bottom spine; send branch/exception steps to the side with edges routed AROUND \u2014 never through \u2014 unrelated nodes. ~5-10 nodes. Label every edge with its trigger/condition and give it a visible arrowhead; color nodes by role via type:\"change\"+status and add a legend.", + "graph_zoned_tiers": "TASK\nYou produce a BOARD PLAN for a diagramming canvas. Input is a spec with fields like:\n- Journey/diagram type (e.g., \"cloud-architecture (flow)\")\n- Use case (the system to diagram)\n- Reader (audience, e.g., SRE)\n- Goal (e.g., onboard a new engineer)\n\nOutput a plan for a GRAPH LAYOUT with a ZONED ARCHITECTURE: group nodes into labeled zones/tiers via groups[]; show the external entry point and the request/data flow as directed edges between components; distinguish managed services from compute; add a legend. 
Keep zones from overlapping and route edges around unrelated nodes.\n\nCRITICAL \u2014 KEEP EVERYTHING ON CANVAS (this is the #1 failure mode)\n- Assume a single fixed canvas (treat it as ~1280\u00d7720, origin top-left, all coordinates >= 0 and within bounds). EVERY node, zone box, legend, and label must fit fully inside these bounds with margins (keep ~24px padding from all edges). Do not let any node, arrowhead, or label spill off-canvas.\n- Before finalizing, do an explicit bounds check: list each zone with its (x, y, width, height) and confirm none exceeds canvas bounds and none overlap. State \"bounds check: all N nodes on-canvas, 0 overlaps.\"\n- Budget space deliberately: count your nodes first, then size zones so the total stacked height/width fits. If there are many nodes (>10), use a more compact grid (multiple columns per tier) rather than one tall column that runs off the bottom.\n- Do NOT add a separate \"Internet\" zone floating above the cloud boundary if it pushes content off-canvas; place the external entry point as a node near the top inside the canvas margin.\n\nPRIORITIZE COMPREHENSION\n- The plan must be directly usable to build the board. Give each node a short, clear label and a zone assignment. Keep the structure simple enough that a reader achieves the stated Goal (e.g., a new engineer can trace the request flow at a glance).\n- Order zones along the flow direction (entry point first \u2192 data stores last). Keep flow direction consistent (default top-to-bottom).\n- Don't over-pack with detail (CIDRs, versions, AZs) at the expense of clarity; include only what serves the reader's goal. 
Detail is fine but must not crowd the layout or cause overflow.\n\nZONES / GROUPS\n- Use nested groups[] for tiers: outer cloud/account boundary (dashed), then VPC, then subnets/tiers (edge, compute, data).\n- Common zoned-architecture tiers for cloud/Kubernetes inputs:\n - Edge/Ingress tier: DNS (Route 53), WAF, Load Balancer / Ingress controller, NAT Gateway.\n - Compute tier: Kubernetes Services (ClusterIP), Deployments/Pods (frontend, api).\n - Data tier: managed DB (RDS PostgreSQL, Multi-AZ), cache (ElastiCache Redis), queue (SQS).\n- Maintain gutters (~40px) between zone boxes; zones must never overlap.\n\nDIRECTED EDGES (request/data flow)\n- Draw the path from external entry point through each tier to data stores, e.g.:\n End User/Browser \u2192 Route 53 \u2192 WAF \u2192 ALB/Ingress \u2192 Service \u2192 Pods \u2192 DB/Cache/Queue.\n- Include internal service-to-service calls (e.g., frontend pods \u2192 api service) and egress (pods \u2192 NAT Gateway, dashed).\n- Route edges around unrelated nodes (e.g., send api\u2192DB/cache/queue edges down one side) so lines don't cross over node boxes. Ensure edges + arrowheads stay within canvas bounds.\n\nDISTINGUISH NODE TYPES (use fill/color and reflect in legend)\n- Managed service (e.g., Route 53, WAF, ALB, NAT, RDS, ElastiCache, SQS): one distinct fill.\n- Compute / workload (pods, deployments, ingress controller): another fill.\n- Abstractions like K8s Service (ClusterIP): a third fill.\n\nLEGEND\n- Place a boxed legend in an empty corner (e.g., bottom-left) that fits fully on-canvas. 
Explain: managed-service fill, compute fill, abstraction fill, dashed outer box = cloud boundary, solid labeled boxes = VPC/subnet zones, solid arrow = request flow, dashed arrow = egress, entry-point icon = external internet.\n\nOUTPUT FORMAT\n- Markdown plan with sections: Layout & Title; Zones (groups[], nested, with x/y/w/h); Directed Edges (numbered); Legend; and a Fit/spacing notes section that includes the explicit bounds check (all nodes on-canvas, 0 overlaps, sized to one screen).", + "board_layout": "BOARD LAYOUT: lay the board top-to-bottom and state the direction; give it a title naming the subject AND its scope. Include EVERY element the rubric requires \u2014 keep each concise and sized so the whole board fits one screen without clipping (don't drop a required element to save space). Use real, specific values, never placeholders. For a multi-section board use panes: rows for a report read in order, a balanced 2x2 grid for a dashboard of peer tiles; title each pane.", + "graph_entity_lanes": "GRAPH LAYOUT (entity/relationship): each entity is a type:\"entity\" node with its attributes in data.fields[] (one {name,type,key?} per row, mark PK/FK) and an explicit width/height \u2014 NEVER cram fields into data.label (that makes one giant overlapping box). Arrange entities in columns; draw each relationship as a labeled edge with cardinality (1, 0..1, 1..*) and an arrowhead. 
Every entity participates in at least one relationship.", + "artifact_note": "Emit every required item as its OWN labeled, discrete element (own header / structured field / inline spec), never as prose \u2014 the grader scores the rendered image.", + "tail_er-diagram": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'er-diagram'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each table/entity is a titled box listing its fields as rows\n- Fields show their data types and key fields (PK/FK) are marked\n- Edges between entities are labeled with cardinality (e.g. 1..*, places, has)\n- Relationship edges have visible arrowheads/markers\n- The schema is laid out top-to-bottom\n- Entity boxes do not overlap and edges do not cross through unrelated entity boxes\n- Every entity participates in at least one labeled relationship (no orphan tables)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_class-diagram": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'class-diagram'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each class is a titled box showing attribute rows and method() rows\n- Method members are visibly distinguished from attributes (e.g. trailing parentheses)\n- Inheritance is drawn with parents above children (BT orientation)\n- Edges show inheritance/association between classes with arrowheads\n- Class boxes do not overlap and edges do not cross through unrelated boxes\n- The parent-child hierarchy is readable at a glance (a clear tree, not a tangle)\n- Every class connects to at least one other (no orphan class)\n\nApply the layout rules above. 
Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_state-machine": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'state-machine'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each distinct state is a clearly labeled node\n- Every transition edge carries a label naming the triggering event/action\n- Every transition edge has a visible arrowhead showing direction\n- Terminal/final state(s) are visually distinguished (e.g. green/done tone)\n- An initial/entry state is identifiable as the starting point\n- The diagram is laid out top-to-bottom\n- No transition edge passes through an unrelated state node\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_cloud-architecture": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'cloud-architecture' (a flow/topology diagram)\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou output a concise plain-text PLAN that names SPECIFIC content (real service names, real labels, real coordinates). Generic plans fail.\n\n## RUBRIC \u2014 satisfy EVERY criterion (each is scored)\n- Resources grouped into labeled zones: a dashed CLOUD BOUNDARY containing solid TIER boxes (e.g., Edge/Ingress, Compute, Messaging/Async, Data/Storage). 
Name the real provider/region (e.g., \"AWS Cloud \u2014 us-east-1\", \"GCP Project: prod-shop / us-central1\").\n- Each resource is a labeled node showing its concrete service AND a logical name (e.g., \"Lambda: OrderApi\", \"Cloud SQL: PostgreSQL (Regional HA)\", \"DynamoDB: Orders table\").\n- Directed edges show request/traffic flow; number each edge and give it a short label (e.g., \"publishes orders\", \"trigger/poll\", \"presigned PUT\").\n- Show the internet/user entry point AND the cloud boundary explicitly.\n- Distinguish managed services (DB/queue/cache/storage) from compute via fill tones.\n- Include a legend mapping tones/line-styles to meaning.\n- No overlaps; edges avoid unrelated nodes (route via gutters).\n\n## CRITICAL: GEOMETRY (this is where plans fail \u2014 do not repeat past mistakes)\nPast plans scored ~0.12 because nodes landed OFF-CANVAS even though the plan TEXT claimed \"bounds check \u2713\". Self-reported checks were false because declared zone heights did not match the coordinates used. Avoid this:\n\n- Canvas is exactly 1280\u00d7720, origin top-left. Usable area after 24px margins: x\u2208[24,1256], y\u2208[24,696]. EVERYTHING must fit inside this \u2014 nodes, zones, legend, title, and the external entry node.\n- For EVERY rectangle (zone, node, legend) compute right=x+w and bottom=y+h. ASSERT right\u22641256 and bottom\u2264696. A zone's declared h MUST be the same value you use everywhere; never write h=320 in the zone list then treat it as 600 in the check.\n- Every node must sit fully inside its parent tier box with \u22658px padding on all sides: node.x \u2265 zone.x+8, node.y \u2265 zone.y+8, node.right \u2264 zone.right\u22128, node.bottom \u2264 zone.bottom\u22128. Verify this for each node.\n- Tier boxes stacked vertically must not overlap: for each adjacent pair, tier.bottom + 24 \u2264 next tier.y (i.e. a gutter of \u226524px between tiers). 
Prefer STACKED horizontal tiers (full-width rows top\u2192bottom) over side-by-side columns \u2014 it keeps math simple and avoids off-canvas drift.\n- Place the legend OUTSIDE the cloud boundary (e.g., a right column at x\u22481056, w\u2248200, fully within x\u22641256) OR a bottom strip \u2014 never overlapping any zone/node.\n- The external user/internet node goes in the top margin band (y\u224824\u201380) above the dashed cloud boundary, horizontally centered.\n- Keep the design small enough to fit: with ~4 stacked full-width tiers inside a cloud box of roughly x=40,y=96,w=1000,h=576, each tier is ~110\u2013120px tall (4 tier heights + three 24px gutters must total \u2264576, e.g. 4\u00d7120+72=552). Do NOT exceed these \u2014 shrink node sizes/counts before pushing past the bounds.\n- End the plan with a per-rectangle bounds table (x,y,w,h,right,bottom) and explicitly state each right/bottom passes. The numbers must be internally consistent with the zone/node lists above \u2014 if they disagree, fix the layout, not the check.\n\n## CRITICAL: COMPREHENSION (also scored ~0.07 \u2014 fix it)\nThe plan must read as a clear, followable explanation of the architecture, not just a coordinate dump.\n- Lead with a 2\u20133 sentence prose summary of the end-to-end flow in plain language (what the user does \u2192 how the request travels \u2192 where data lands), naming the real services.\n- Keep edge labels meaningful so a reader can trace the request/event path without guessing.\n- Order tiers in the actual direction of traffic (entry \u2192 edge \u2192 compute \u2192 messaging \u2192 data) so reading top-to-bottom = following the flow.\n- Use realistic, coherent service choices for the stated provider and use case; the topology must tell one consistent story (e.g., a single request path + one async fan-out), not a random pile of services.\n\n## OUTPUT FORMAT\n1. Title (centered, includes use case + audience/goal hint).\n2. One-paragraph flow summary (prose).\n3. Zones list (cloud boundary \u2192 tiers) with exact x,y,w,h and labels.\n4. 
Nodes per zone with exact x,y,w,h, label (service + name), and tone (compute vs managed vs abstraction).\n5. Numbered directed edges with labels and routing notes (which gutter).\n6. Legend (tones + line styles + entry icon), with its own coordinates.\n7. Bounds table verifying every rectangle: right\u22641256, bottom\u2264696, and every node inside its tier.\n\nBe concrete and keep the geometry self-consistent. A plan whose coordinates fit the canvas and whose flow is easy to follow beats a verbose one that claims to fit but does not." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo2_graph/report.md b/scripts/experiments/gepa-flowchart/overnight/topo2_graph/report.md new file mode 100644 index 0000000..a07f801 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo2_graph/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: er-diagram, class-diagram, state-machine, cloud-architecture +shared skills: artifact_note, board_layout, graph_entity_lanes, graph_process_spine, graph_zoned_tiers +shared skills CHANGED by GEPA: (none) + +**Seed 0.4793 -> Best 0.5701 (+0.0907)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo2_report/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo2_report/best_topology_skills.json new file mode 100644 index 0000000..cfb10eb --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo2_report/best_topology_skills.json @@ -0,0 +1,9 @@ +{ + "report_rows": "REPORT LAYOUT: a cohesive top-down document read in order (rows), not a peer grid. Title each section and make sections visually distinct (heading + spacing). Embed REAL artifacts (a chart spec, a table, a map, an image) rather than describing them in prose.", + "board_layout": "BOARD LAYOUT: lay the board top-to-bottom and state the direction; give it a title naming the subject AND its scope. 
Include EVERY element the rubric requires \u2014 keep each concise and sized so the whole board fits one screen without clipping (don't drop a required element to save space). Use real, specific values, never placeholders. For a multi-section board use panes: rows for a report read in order, a balanced 2x2 grid for a dashboard of peer tiles; title each pane.", + "artifact_note": "You generate JSON UI specs that are RENDERED TO AN IMAGE and scored by an automated grader on four axes: comprehension, visual_quality, geometry (no overflow/clipping), and rubric (presence of each required item). Optimize for ALL four. The grader inspects the RENDERED image, so every required item must appear as its OWN discrete, conventionally-typed, labeled element \u2014 never as prose, never inside a custom/non-standard component the renderer may not understand, never merged into another field.\n\n## INPUT FORMATS\nYou receive one of two task shapes:\n\n1) COMPONENT task (e.g. \"recipe (component)\"): inputs are Journey, Use case, Reader, Goal. Output a single Mantine-style component tree: an object with `type`, `props`, `children`. Allowed/known component types include: Stack, Title, Text, Card, SimpleGrid, List (with List.Item), Table, Image, Alert, Badge, Blockquote. `children` may be a string, an object, or an array.\n\n2) PANES task (e.g. \"explainer (panes)\"): inputs are Journey, Use case, Reader, Goal. Output `{ \"layout\": <\"rows\"|\"cols\">, \"panes\": [...] }`. Each pane has `title`, `type`, `content`. Known pane types: \"markdown\", \"flow\" (nodes/edges), \"vegalite\" (chart spec), \"component\" (a component tree built from the known component types above; the explainer and algorithm-walkthrough rubrics each require one).\n\n## RUBRIC: EMIT EACH REQUIRED ITEM AS A DETECTABLE ELEMENT\nThe grader scores each rubric item separately and gives 0.0 when it cannot detect that item in the rendered image. Past failures (numbered_steps 0.0, ingredients_list 0.0, nutrition_or_tips 0.0, quantitative_chart 0.0, misconception_alert 0.0, grounding_pane 0.2) were caused by using non-standard or non-detectable representations. 
Rules:\n\n- **Use ONLY the known component/pane types listed above.** Do NOT invent components like \"Checklist\" \u2014 they don't render and the item scores 0.0. (Example 2 used a `Checklist` for ingredients \u2192 ingredients_list scored 0.0.)\n- **Numbered steps** must be an ordered `List` with `props: {\"type\": \"ordered\"}` and one `List.Item` per step, rendered as a top-level visible section under a \"Steps\" Title. Do NOT nest the ordered list so deep that it clips. (Both recipe examples scored numbered_steps 0.0 \u2014 likely clipped/undetected; keep the steps section high in the layout and fully visible.)\n- **Ingredients** must be a real bulleted `List` of `List.Item`s (optionally grouped in Cards), each ingredient its own item with quantity.\n- **Nutrition / tips** must be present as their OWN labeled element: a `Table` (for nutrition, with head + body rows) AND/OR a titled `List` of tips. Give each its own header. Don't let it be the last thing that overflows off-frame.\n- **Charts (quantitative_chart)** in panes must be a `vegalite` pane that actually renders. Keep the spec SIMPLE: a single primary mark (line/bar) with clear x/y encodings. Avoid stacking many `layer`s with separate inline `data` blocks and text-annotation layers \u2014 over-complex multi-layer specs failed to render (scored 0.0). Put threshold context in the pane title or a single rule layer at most.\n- **Misconception / alert items** must use a clearly detectable callout: an `Alert` component (component task) or, in a pane, a markdown pane whose content is a blockquote. Make it visually distinct and self-contained. 
(A markdown blockquote alone scored misconception_alert 0.0 \u2014 prefer an explicit Alert-style structure and ensure it's not clipped.)\n- **Grounding pane** (real numbers/sources) must be its own pane with concrete cited figures in a small table or labeled list, not folded into other panes.\n\n## GEOMETRY: PREVENT OVERFLOW/CLIPPING (this capped every example)\nEvery example lost points to \"content overflows/clips.\" Be aggressively economical:\n\n- **Reduce total content volume.** Prefer fewer, denser sections over many stacked blocks. Each extra Card/section risks pushing later (often rubric-critical) content off-frame.\n- **Cap grid columns.** Do not use `cols: 5` of metric cards \u2014 it overflows horizontally. Use 3\u20134 max, or a single compact metadata row. Reserve vertical space for the rubric items (steps, ingredients, nutrition).\n- **Keep text short** in each element; avoid long parenthetical asides inside List.Items and step text.\n- **Order by importance:** put rubric-required sections (ingredients, numbered steps, nutrition/tips, charts, misconception) where they are guaranteed visible. Don't bury them after large images or wide grids.\n- **Limit images** to one, modest height (h \u2264 200), so they don't consume the frame.\n- **For panes:** keep each pane's content compact; long markdown tables + long blockquotes in the same board cause clipping. Trim rows.\n\n## GENERAL STRATEGY\n1. List the rubric items implied by the use case (e.g. recipe \u21d2 ingredients_list, numbered_steps, nutrition_or_tips; explainer \u21d2 quantitative_chart, misconception_alert, grounding_pane, mechanism/flow).\n2. For each, choose the simplest KNOWN, detectable element type and give it its own header/label.\n3. Lay out so all rubric items fit without clipping \u2014 trim everything non-essential first.\n4. 
Validate every component/pane uses only known types and renders simply.\n\nOutput ONLY the JSON spec, no commentary.", + "tail_recipe": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'recipe'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A clear ingredients list with quantities\n- Numbered preparation steps in order\n- Prep/cook time and number of servings are shown\n- Difficulty or skill level is indicated\n- Nutrition info or chef tips are included\n- A visual/photo of the dish is included\n- Title names the dish\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_explainer": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'explainer'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The top pane opens with a plain-language hook/surprising claim, not the abstraction\n- A flow pane shows the reasoning chain as color-coded nodes (postulate\u2192reasoning\u2192conclusion)\n- A vegalite pane shows the key quantitative relationship with a title saying what to notice\n- A component pane gives concrete numbers and a real-world anchor\n- A common misconception is addressed (an alert)\n- The panes are stacked top-down as a cohesive explainer (hook\u2192mechanism\u2192chart\u2192grounding)\n- Each layer is short, building intuition over symbols\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_algorithm-walkthrough": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'algorithm-walkthrough'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A markdown/code pane shows the annotated algorithm code\n- A component pane shows a step-by-step trace table of the state at each step\n- A vegalite pane shows a complexity curve for the algorithm\n- The three panes (code / trace / complexity) are visually distinct and labeled\n- The trace table rows are ordered by execution step\n- The complexity chart has a title and axis titles\n- Together the panes explain how the code works, not just show it\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_diy-project-plan": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'diy-project-plan'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A title plus badges (e.g. difficulty, time, cost) head the plan\n- A materials/tools table lists what's needed\n- An interactive checklist of build steps is present\n- At least one video tutorial reference card with a thumbnail is shown\n- A budget chart (VegaLite) breaks down the cost\n- The plan reads as a cohesive top-down document, not a grid of peers\n- A map or list of where to buy materials is included\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo2_report/report.md b/scripts/experiments/gepa-flowchart/overnight/topo2_report/report.md new file mode 100644 index 0000000..5c08046 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo2_report/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: recipe, explainer, algorithm-walkthrough, diy-project-plan +shared skills: artifact_note, board_layout, report_rows +shared skills CHANGED by GEPA: artifact_note + +**Seed 0.4243 -> Best 0.5302 (+0.1059)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo2_screen/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo2_screen/best_topology_skills.json new file mode 100644 index 0000000..323bc84 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo2_screen/best_topology_skills.json @@ -0,0 +1,9 @@ +{ + "screen_frame": "TASK: You produce a BOARD PLAN (a written spec) for a UI mockup that another tool will render visually. Read the inputs (Journey, Use case, Reader, Goal) and output a clean, structured plan that renders as a real, polished app screen \u2014 NOT a diagram.\n\nTwo journey types:\n- \"app-screen-mockup (component)\" \u2192 ONE single mobile frame.\n- \"screen-set (panes)\" \u2192 MULTIPLE peer mobile frames in one titled set.\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#1 PRIORITY \u2014 RESPECT THE MOBILE BUDGET (prevents overflow/clipping)\nContent overflow is the #1 scoring failure. A real phone frame (~390\u00d7844) fits far LESS than you think. BE RUTHLESS \u2014 underfill rather than overfill. 
Per frame, the TOTAL of all content must stay within:\n- Status bar + header (1 line title) \u2014 counts toward budget\n- THEN choose ONE of these content shapes, not several:\n \u2022 2 cards max (3 only if each is tiny), OR\n \u2022 2\u20133 list rows max, OR\n \u2022 1 short form (3 input fields max)\n- Plus ONE primary CTA.\nHARD CAPS: no more than ~7 total content blocks in a frame; no card with more than 4 inner lines; NEVER use multi-line \"Order summary\" / itemized receipts / long dotted-leader tables \u2014 these always clip. If the use case implies lots of detail (checkout, summary), SUMMARIZE aggressively (e.g. \"2 items \u00b7 $164.93\" on one row) instead of listing every line.\nA bottom tab bar is optional; if used it counts as a block.\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#2 \u2014 REALISTIC, POLISHED APP LOOK (not a diagram)\n- Always include: a status bar (e.g. 
\"9:41\" + signal/wifi/battery), a top app bar/header with a real screen title, a primary content area, and ONE prominent CTA.\n- Use realistic placeholder content: real-sounding names, emails, prices, copy \u2014 never \"Lorem ipsum\" or \"Label 1\".\n- Consistent rounded styling: cards 12\u201316px radius, inputs 12px radius, buttons 10\u201312px radius; 16px side margins; 12px gaps between cards.\n- Light neutral background (#F5F6F8 or #F2F2F7), white cards, soft shadow, one accent color used sparingly.\n- Provide a font hierarchy note (header bold ~17pt, section labels small uppercase gray, body ~14px).\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#3 \u2014 PRIMARY CTA MUST BE UNMISTAKABLE (rubric: primary_cta)\n- Exactly ONE dominant CTA per frame: full-width, solid accent-color fill, bold white text, rounded.\n- Make it the visually largest/boldest interactive element. Secondary actions must be plain text links, clearly subordinate (smaller, no fill).\n- Place it prominently (sticky bottom bar or near bottom of content).\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#4 \u2014 FOR SCREEN-SETS (rubric: multiple_screens, realistic_screens, titled_set)\n- Give the whole set a clear TITLE BANNER at top (e.g. \"Sign-Up Flow \u2014 Acme App \u00b7 3 Steps\") and a caption above each frame (\"Step 1 of 3 \u00b7 Create Account\").\n- Lay frames LEFT-TO-RIGHT as peer tiles, connected by labeled arrows showing transition direction.\n- Each frame must be DISTINCT and individually realistic \u2014 different content/state per step (each obeying the mobile budget above), not near-duplicates. 
Show state progression (e.g. progress dots \u25cf\u25cb\u25cb \u2192 \u25cf\u25cf\u25cb \u2192 \u25cf\u25cf\u25cf, success-green CTA on a final/done screen).\n- Keep shared chrome consistent across frames (same phone outline, status bar, header style, input style, accent color).\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\nOUTPUT FORMAT\n- Start with \"# BOARD PLAN: <title>\".\n- State orientation (single frame top-to-bottom, or set left-to-right) and global styling up front.\n- Number the sections of each frame top\u2192bottom.\n- End with a short hierarchy/consistency note.\n- Confirm the content fits one screen with no clipping (\"one screen, no overflow\").\n\nREMEMBER: A sparse, clean frame that clearly reads as a real app scores far higher than a dense, accurate-but-clipped one. When in doubt, cut content.", + "board_layout": "You produce a BOARD PLAN: a text spec describing a single-screen (or multi-frame) visual layout \u2014 design-system showcase, wireframe, dashboard, component kit, app-screen mockup, screen-set, or report. Input gives Journey/use-case, Reader, and Goal.\n\nTOP PRIORITY \u2014 COMPREHENSION + FITTING ONE SCREEN. Plans are scored on (a) how clearly a reader grasps WHAT is shown and WHY, and (b) whether content fits without overflowing/clipping. Past plans scored badly because they OVERFILLED (geometry clipped on nearly every attempt) and buried meaning under styling chrome (hex codes, shadows, fonts, radii, status bars, \"9:41\", letter-spacing, device outlines). The single best-scoring plan was the LEANEST one. So: cut decoration, cut element count, foreground meaning.\n\n=== CRITICAL: PREVENT OVERFLOW ===\nContent overflow was the #1 failure across every example. 
Be aggressive about keeping it small:\n- HARD CAP per single mobile frame: about 6\u20137 stacked regions MAX, each 1\u20133 short lines.\n- DELETE non-essential chrome entirely: NO status bar / \"9:41\" / signal-wifi-battery, NO device outline, NO repeated styling boilerplate. These consume budget and add zero comprehension.\n- For a screen-set / multi-frame board: even FEWER elements per frame (4\u20135 max), because multiple frames share one screen. Make each frame deliberately sparse.\n- When in doubt, underfill. Prefer empty space over a dropped/clipped element.\n\n=== STYLING: ONE LINE ONLY ===\nGive exactly ONE brief global-style line (e.g. \"Low-fi greyscale, one indigo accent on the primary CTA, top-to-bottom stacking, generous gaps\"). Do NOT list hex palettes, shadow specs, font point-sizes, radii, margins-in-px, or repeat styling per frame. Styling is a thin supporting layer, never the focus. Note: visual-quality already scores high; spending words on styling only steals from comprehension and overflows the screen.\n\n=== STRUCTURE ===\n1. TITLE & ORIENTATION: One title naming subject AND scope. State layout direction explicitly (default top-to-bottom). State it's one screen with no clipping.\n - Multi-section report \u2192 stacked rows read in order.\n - Dashboard of peer tiles \u2192 balanced 2\u00d72 grid.\n - Component/design-system showcase or wireframe \u2192 single frame, sections top-to-bottom.\n - Screen-set / multi-frame \u2192 peer frames; see flow rules below.\n\n2. LABEL PURPOSE FOR EVERY REGION (biggest comprehension driver). For each region write a short plain-English statement of what it is and what it communicates: \"HEADER \u2014 global nav + user menu\", \"LIST \u2014 table of records; tap = detail\". Make each region's function obvious to a skimmer. Every region gets a purpose line.\n\n3. REAL, SPECIFIC CONTENT over placeholders. Use concrete named values the reader can read and act on. 
Name each component WITH its state/usage: \"Text input (error): 'Password' + helper 'Min. 8 characters'\", \"Toggle (ON): 'Notifications'\". Concrete data > visual flourish.\n\n4. COVER EVERY RUBRIC ELEMENT the use-case implies, each kept tiny:\n - Form/input kit: fields, selects, toggles, validation/error states, button variants, labels.\n - Design-system/component spec: typography scale (name 2\u20133 type roles concisely, e.g. \"Heading 17 / Body 14 / Caption 12\"), component variants (show default + at least 2 more states, e.g. default/hover/disabled), form elements, spacing/grid note \u2014 each must appear, none skipped (these were marked weak when underspecified).\n - Wireframe: header, nav/sidebar, content regions, list/table, primary CTA, footer.\n Keep every required element present but minimal so all fit one screen.\n\n=== SCREEN-SET / MULTI-FRAME RULES (these scored worst \u2014 fix them) ===\n- ANNOTATED FLOW is required and was the weakest item: between every pair of frames draw a labeled connector naming the exact triggering action and direction, e.g. Frame1 \u2014[\"Get Started\"]\u2192 Frame2 \u2014[\"Allow & Continue\"]\u2192 Frame3. Also annotate the RESULT/state-change each transition causes (\"permission granted \u2192 unlocks task entry\"). Make the forward flow unmistakable.\n- MULTIPLE SCREENS: clearly delineate each frame with its own caption (\"Step 2 of 3 \u00b7 Permissions\") and a per-frame progress indicator. Ensure ALL frames are fully shown \u2014 never let the last frame clip. Keep each frame to 4\u20135 elements so the set fits.\n\n5. CLOSE with one hierarchy/consistency line: name the dominant element, confirm regions are clearly distinguished, and confirm everything fits one screen with no overflow.\n\nWrite so a reader instantly grasps each region's content and purpose. Optimize for: concrete content + per-region purpose labels + strict element economy that guarantees one-screen fit + (for sets) explicit annotated flow. 
Treat styling as one minimal line.", + "artifact_note": "Emit every required item as its OWN labeled, discrete element (own header / structured field / inline spec), never as prose \u2014 the grader scores the rendered image.", + "tail_app-screen-mockup": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'app-screen-mockup'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Looks like a real polished app screen, not a diagram\n- A header / top app bar / nav is present\n- A clear primary content area (cards/list/feed/form) fills the screen\n- A prominent primary call-to-action button/action is present\n- Clear visual hierarchy via headings, sections and spacing\n- Realistic placeholder content (names, values, labels), not lorem stubs\n- Consistent polished styling (colors, spacing, rounded cards)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_wireframe": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'wireframe'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Low-fidelity look: greyscale boxes/placeholders, minimal color or imagery\n- Layout regions are blocked out (header, sidebar, content, footer)\n- Placeholder text/image blocks stand in for real content\n- Annotations/notes explain regions or interactions\n- Elements align to a clear grid/structure\n- Key UI elements (nav, buttons, inputs, lists) are indicated as boxes\n- Titled with the screen/page name\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_design-system": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'design-system'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A color palette with swatches (primary/secondary/semantic) is shown\n- A typography scale (headings/body) is shown\n- Core components shown in their variants (e.g. button states, badges)\n- Form elements (inputs, toggles, selects) are displayed\n- Spacing/grid or sizing tokens are documented\n- Each element/section is labeled with its name/usage\n- A consistent visual brand across the showcase\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_screen-set": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'screen-set'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Several distinct app screens are shown side by side\n- The screens tell a sequence/flow (step 1 \u2192 2 \u2192 3)\n- Each screen/frame has a title/caption\n- Each frame looks like a real screen, not a diagram\n- Consistent visual style across the frames\n- The flow/transition between frames is indicated\n- The set has an overall title naming the flow\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo2_screen/report.md b/scripts/experiments/gepa-flowchart/overnight/topo2_screen/report.md new file mode 100644 index 0000000..0e904fe --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo2_screen/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: app-screen-mockup, wireframe, design-system, screen-set +shared skills: artifact_note, board_layout, screen_frame +shared skills CHANGED by GEPA: board_layout + +**Seed 0.5410 -> Best 0.6571 (+0.1161)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo3_chart/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo3_chart/best_topology_skills.json new file mode 100644 index 0000000..c2bbc27 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo3_chart/best_topology_skills.json @@ -0,0 +1,19 @@ +{ + "chart_internal": "CHART: emit a COMPLETE inline Vega-Lite spec (never a description). Title + subtitle; axis titles WITH units; set width to \"container\". Sort categories explicitly (usually descending). Put value labels on marks and a legend when there are multiple series; make the key insight obvious.", + "board_layout": "You generate a \"BOARD PLAN\" plus a renderable Vega-Lite v5 spec for a single-screen data visualization (\"board\"). \n\nINPUT FORMAT (free text fields):\n- Journey: the chart/board type (e.g. confusion-matrix, ablation-study) and renderer (vegalite)\n- Use case: the specific scenario/data the chart depicts\n- Reader: the audience\n- Goal: the analytic question the reader must answer at a glance\n\nYOUR OUTPUT: a markdown plan (layout, title/scope, content table, rubric mapping) followed by ONE valid Vega-Lite v5 JSON spec. Invent realistic, specific values (real-sounding model names, datasets, plausible numbers) \u2014 never placeholders.\n\n=== CRITICAL CORRECTNESS RULES (failures here tanked scores) ===\n\n1. 
VALID SCHEMA URL \u2014 always use exactly:\n \"$schema\": \"https://vega.github.io/schema/vega-lite/v5.json\"\n Do NOT invent variants like \"vega-lite.github.io\", \"vega-lite.org\", or \".v5.json\". A wrong schema URL breaks rendering and zeros visual quality.\n\n2. NO CLIPPING / OVERFLOW \u2014 the whole board must fit one screen unclipped.\n - Keep height modest (\u2248300\u2013380) and use \"width\": \"container\".\n - When you draw value labels ABOVE bars (dy negative), the top labels get cut off if the scale ends at the data max. Either add headroom to the y-scale domain (e.g. domain [0, 110] for a 0\u2013100 metric, or extend the max ~15% above the largest value) OR place labels inside/below the bar. Never let text marks render outside the plotting area.\n - Avoid stacking multiple offset text layers (value + delta + baseline label) at the same anchor \u2014 they collide and overflow. Prefer ONE consolidated label per bar.\n\n3. EVERY RUBRIC ELEMENT MUST ACTUALLY RENDER IN THE SPEC, not just be described in prose. Layers that reference fields not in the data, or use `detail`/sort mismatches, silently fail. Verify each text/rule/mark layer binds to real data fields. If you claim \"delta shown\" or \"baseline reference\", that exact mark must appear and be visible.\n\n=== BOARD LAYOUT RULES ===\n\n- Lay the board top-to-bottom; state the read direction explicitly.\n- TITLE must name the subject AND its scope (model + dataset + metric). Use `\"anchor\": \"start\"`. 
Put the key takeaway/insight in the subtitle.\n- Use real, specific values everywhere.\n- Multi-section boards: use rows for an ordered report, a balanced 2\u00d72 grid for a peer-tile dashboard; title each pane.\n- Include EVERY element the rubric requires; keep each concise so nothing is dropped or clipped.\n\n=== CHART-TYPE PLAYBOOKS ===\n\nCONFUSION MATRIX (this pattern scored highest \u2014 replicate it):\n- N\u00d7N rect heatmap: actual classes as rows (y, top-to-bottom), predicted as columns (x). Use matching `sort` arrays on both axes so order is consistent.\n- One text mark per cell printing the count; flip text color to white on dark cells (e.g. test datum.count > threshold).\n- Sequential color scale (e.g. \"blues\") with domain [0, max]; legend titled (e.g. \"Review count\").\n- Make the diagonal distinguishable (e.g. thick gold stroke #FFB000 strokeWidth 3 on diagonal, faint grey elsewhere) via a \"diag\" boolean field.\n- Both axes titled with class names; surface the biggest confusion pair in the subtitle.\n\nABLATION STUDY (these scored LOW \u2014 be especially careful):\n- ONE bar per ablated variant (component_bars). Each variant = removing one component; include the metric value for each.\n- SORTED BY IMPACT: order bars so the most-damaging ablation is first/leftmost. For \"lower is better\" metrics (perplexity) sort DESCENDING; for \"higher is better\" (success rate) sort so the biggest drop is leftmost (ascending success). Apply the SAME sort spec on every layer that uses that axis.\n- BASELINE REFERENCE: draw a dashed `rule` mark at the full-model baseline value with a visible text label (e.g. \"Baseline (full): 78%\"). This must render \u2014 keep the label inside the plot bounds.\n- DELTA SHOWN: make each bar's change vs baseline explicit and VISIBLE \u2014 prefer a single per-bar label combining value and delta (e.g. \"41% (\u221237 pp)\"). 
Ensure the label fits (add y-scale headroom).\n- Y axis titled with the metric and its direction (e.g. \"Task success rate (%)\", \"Validation Perplexity (PPL, lower = better)\") with a fixed sensible domain.\n- Color bars by impact magnitude so \"what matters most\" reads instantly.\n- Title names the model + metric.\n\n=== AFTER THE SPEC ===\nAdd a short \"rubric mapping\" listing each required element and where it is satisfied. Double-check the JSON is syntactically valid, the schema URL is correct, no text marks fall outside the plot, and every described element exists as a real, data-bound mark.", + "artifact_note": "Emit every required item as its OWN labeled, discrete element (own header / structured field / inline spec), never as prose \u2014 the grader scores the rendered image.", + "tail_complexity-growth": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'complexity-growth'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Multiple growth classes/implementations are plotted as distinct labeled lines over input size\n- The y-axis uses a log/symlog scale so all growth classes are visible together\n- Data points are marked on the lines (point:true)\n- Both axes have titles (e.g. input size n / operations)\n- A legend identifies each growth class/series\n- The chart has a descriptive title and subtitle\n- Hovering shows tooltip values (tooltip enabled)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_calendar-heatmap": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'calendar-heatmap'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The chart is a grid with week index on one axis and day-of-week (Mon-Sun) on the other\n- Each cell is a discrete rectangle with small gaps between cells\n- Cell color encodes the count on a sequential scale where zero is the palest cell\n- The chart has a title and subtitle naming the activity and period\n- Tooltip shows the full date plus the count for a cell\n- Axis labels are thinned (not every cell labeled) so the grid stays compact and readable\n- A color legend maps shades to count ranges\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_correlation-heatmap": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'correlation-heatmap'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The chart is a square N\u00d7N matrix with identical sorting on both axes\n- The numeric value is printed inside each cell (text layer over rect)\n- Cell background color encodes the value (diverging scale for correlation, sequential for latency)\n- Cell text stays legible on every cell (light text over dark/hot cells, dark over light)\n- Both axes are titled naming the two dimensions\n- The chart has a title and subtitle\n- Tooltip shows both keys and the value for a cell\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_gantt": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'gantt'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each task is a horizontal bar spanning its start date to end date\n- A vertical 'today' rule line is present, with a label\n- Bars are colored by an explicit status scale (done/active/blocked/planned)\n- Tasks are sorted by start date down the y-axis\n- The x-axis is a dated/temporal time axis\n- Tooltip shows task, status, owner, and dates\n- The chart has a title and subtitle\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_roadmap": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'roadmap'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each initiative is a bar spanning quarter-start to quarter-end\n- The x-axis is formatted by quarter (e.g. Q1 '26), not by day\n- Bars are colored by team/owner\n- A legend maps colors to teams\n- It reads as a strategic roadmap (coarse quarters), not a day-level Gantt with a today rule\n- The chart has a title and subtitle\n- Tooltip shows the initiative, team, and quarter span\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_sprint-burndown": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'sprint-burndown'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Two lines are plotted: an ideal burndown and the actual remaining work\n- The ideal line is a straight dashed/grey line from total down to zero\n- The actual line is a distinct color with marked points and stops at today\n- A vertical 'today' rule line marks the current sprint day\n- The y-axis is fixed from 0 to the sprint total so both lines share a frame\n- Axes are titled (sprint day / story points remaining)\n- A legend distinguishes the ideal vs actual series\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_trace-waterfall": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'trace-waterfall'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each span is a horizontal bar from its start ms to end ms\n- Spans are sorted by start time down the y-axis so they cascade\n- Spans are colored by service as a second channel (in addition to y-position)\n- The x-axis domain is fixed from 0 to the total trace duration\n- Parent spans visibly bracket the time range of their children\n- Tooltip shows span, service, start, and duration\n- The chart title/subtitle states the request and total duration\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_growth-funnel": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'growth-funnel'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Funnel stages are shown as bars sorted from widest (top of funnel) to narrowest\n- Each stage shows its count (as a bar length and/or printed value)\n- Each bar is labeled with its numeric count\n- The count axis is titled and the stage axis labels each stage\n- The chart has a title and subtitle (e.g. period covered)\n- Tooltip shows the stage and count\n- Stages read in funnel order from top to bottom\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_training-curves": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'training-curves'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Separate train and validation curves are both plotted\n- X axis is the training step/epoch with a clear axis title\n- Y axis is the loss/metric with a clear axis title\n- The train/val divergence (overfitting gap) is visible and legible\n- The best checkpoint / early-stop point is marked (rule or point)\n- A legend distinguishes the train vs validation series\n- Chart has a descriptive title and subtitle naming the run\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_confusion-matrix": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'confusion-matrix'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- An N\u00d7N grid of actual (rows) vs predicted (columns) classes\n- Each cell shows its count or rate as a text label\n- Cells are color-encoded by value with a sequential scale\n- Both axes are labeled (actual vs predicted) with the class names\n- The correct-prediction diagonal is distinguishable from off-diagonal errors\n- A color legend/scale is present\n- Title names the classifier and dataset\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_ablation-study": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'ablation-study'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- One bar per ablated component/variant\n- The full/baseline model is shown as a reference bar or rule\n- The drop/delta from removing each component is clear\n- Y axis is the eval metric with a clear axis title\n- Variants are ordered by impact on the metric\n- Bars are labeled with their metric values\n- Title names the model and the metric being ablated\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_infra-cost-breakdown": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'infra-cost-breakdown'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A bar (or stacked bar) chart of cost by service/resource category\n- The cost axis has a title and currency units\n- Each category/service is clearly labeled\n- Categories are sorted by cost (largest first)\n- A total and/or per-bar value labels are shown\n- The top cost driver is visually highlighted\n- Title names the account/environment and the period\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_home-budget": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'home-budget'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A chart of spending by category\n- Income vs total spending (or savings) is shown\n- The money axis has a title and currency units\n- Categories sorted by amount or shown as proportions\n- Category values or percentages are labeled\n- The largest category is highlighted\n- Title names the household and the month\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_generic-vegalite": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'generic-vegalite'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The chart type fits the data and the question being asked\n- Title, subtitle and axis titles are all present\n- Scales and units are readable and appropriate\n- A legend is present when multiple series/categories exist\n- The chart makes the key insight visually obvious\n- Tooltips or labels give additional context\n- Not overcrowded; fits the canvas cleanly\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo3_chart/report.md b/scripts/experiments/gepa-flowchart/overnight/topo3_chart/report.md new file mode 100644 index 0000000..1ee861a --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo3_chart/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: complexity-growth, calendar-heatmap, correlation-heatmap, gantt, roadmap, sprint-burndown, trace-waterfall, growth-funnel, training-curves, confusion-matrix, ablation-study, infra-cost-breakdown, home-budget, generic-vegalite +shared skills: artifact_note, board_layout, chart_internal +shared skills CHANGED by GEPA: board_layout + +**Seed 0.6044 -> Best 0.6744 (+0.0700)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo3_comparison/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo3_comparison/best_topology_skills.json new file mode 100644 index 0000000..15d5c41 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo3_comparison/best_topology_skills.json @@ -0,0 +1,11 @@ +{ + "comparison_grid": "COMPARISON LAYOUT: equal-width option cards side by side with 
IDENTICAL aligned rows; add a delta column (arrow + sign + value + %) where it applies; flag the best/recommended option with a badge (tone grey\u2192green). For products, show an image and an outbound link per option. Limit to the 3-4 most relevant options and keep each card compact (small image, ~4-5 spec rows) so the row fits on one screen without horizontal overflow.", + "board_layout": "BOARD LAYOUT: lay the board top-to-bottom and state the direction; give it a title naming the subject AND its scope. Include EVERY element the rubric requires \u2014 keep each concise and sized so the whole board fits one screen without clipping (don't drop a required element to save space). Use real, specific values, never placeholders. For a multi-section board use panes: rows for a report read in order, a balanced 2x2 grid for a dashboard of peer tiles; title each pane.", + "artifact_note": "Emit every required item as its OWN labeled, discrete element (own header / structured field / inline spec), never as prose \u2014 the grader scores the rendered image.", + "tail_before-after": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'before-after'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are planning a single-screen visual board that compares a \"Before\" state and an \"After\" state side by side. Output a concise plain-text plan that names SPECIFIC, REAL content (concrete values, labels, structure) satisfying every rubric criterion. 
Generic plans fail.\n\nSTRICT RUBRIC \u2014 satisfy EVERY criterion:\n- A two-column layout shows a Before card and an After card side by side (equal width, aligned rows)\n- Each card contains a table of the SAME metrics in the SAME order for direct comparison\n- The After card includes a delta (\u0394) column showing the change\n- A headline badge shifts color (grey baseline \u2192 green) to signal improvement\n- The cards are clearly labeled literally \"Before\" and \"After\"\n- Metric rows align across both cards so values are directly comparable\n- The direction of improvement is visually clear (color + sign + arrow on deltas)\n\nREQUIRED STRUCTURE (follow this proven layout):\n\n1. TITLE BAR \u2014 a specific, descriptive title naming the subject and time/version range (e.g. \"Cloud Cost Reduction \u2014 AWS Production (Q2 \u2192 Q3 2024)\").\n\n2. HEADLINE BADGE (color-shifting) \u2014 a single full-width pill, GREEN (shifted from grey baseline), stating the single most important improvement with a direction arrow, absolute amount, and percent (e.g. \"\u25bc 38% Monthly Spend Reduced \u2014 $17,200/mo saved\"). Note the grey\u2192green shift signals net improvement at a glance.\n\n3. TWO-COLUMN COMPARISON (equal-width cards, identical aligned rows):\n - CARD 1 \u2014 \"BEFORE\": label badge in GREY reading \"Before \u00b7 <date/version>\". Table with two columns: Metric | Value. Use 4 concrete metrics with realistic values.\n - CARD 2 \u2014 \"AFTER\": label badge in GREEN reading \"After \u00b7 <date/version>\". Table with three columns: Metric | Value | \u0394 Change. The SAME 4 metrics in the SAME order. Each delta cell shows: color dot (\ud83d\udfe2) + direction arrow (\u25bc or \u25b2) + sign + absolute change + (percent). The \u0394 column appears ONLY on the After card.\n\n4. 
RUBRIC SATISFACTION NOTES \u2014 a short bullet list mapping each rubric criterion to how the plan meets it.\n\nCONCRETENESS RULES:\n- Invent realistic, internally-consistent numbers. Compute deltas correctly (absolute and %).\n- Choose the arrow/color by what counts as improvement for each metric: if lower is better, use \u25bc + minus; if higher is better, use \u25b2 + plus. Either way, color green when it is an improvement.\n- Keep all rows in identical order and identical count across both cards so they line up horizontally.\n- Design to fit one screen, no scroll; read left (Before) \u2192 right (After).\n\nFocus on maximizing reader comprehension: make the headline number and the direction of every delta instantly understandable. Output the plan in clean markdown using tables and headers as shown.", + "tail_product-comparison": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'product-comparison'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each option is its own card in a grid\n- Each option shows a prominent product image/logo (not a tiny letter)\n- Each option shows its price\n- Each option shows a star rating or score badge\n- Each option lists its key features\n- Each option has a clickable outbound 'Visit site \u2192' link\n- A standout/recommended option is flagged with a badge (e.g. 'Popular')\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_shopping-comparison": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'shopping-comparison'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each shopping option is its own card in a comparison grid\n- Each option shows a prominent product image\n- Each option shows its price\n- The cheapest (or best-value) option is flagged with a badge\n- Each option lists key specs/features for comparison\n- Each option has an outbound buy/visit link\n- Each option shows a rating or review score\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_flight-comparison": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'flight-comparison'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each flight option is its own card/row in a comparison\n- Each flight shows its total price\n- Each flight shows departure/arrival times and total duration\n- Each flight shows its number of stops (nonstop vs layovers)\n- The cheapest or best-overall option is flagged with a badge\n- Each flight shows its airline (logo or name)\n- Each flight has an outbound book/select link\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_model-leaderboard": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'model-leaderboard'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each model is its own row in a comparison table\n- Multiple eval metrics are shown as columns\n- The best value per metric (or the winning model) is flagged with a badge\n- Model size/params or cost/latency is shown for each model\n- Models are ranked/sorted by a primary metric\n- An overall winner or aggregate score is indicated\n- Title states the benchmark/dataset the models were evaluated on\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_trip-comparison": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'trip-comparison'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each destination/option is its own card\n- Each option shows a representative image\n- Each option shows an estimated cost\n- Weather or best-season info per option\n- Key highlights/activities per option\n- A recommended/best-value option is flagged\n- Each option has an outbound link to book or learn more\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo3_comparison/report.md b/scripts/experiments/gepa-flowchart/overnight/topo3_comparison/report.md new file mode 100644 index 0000000..d47a8e2 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo3_comparison/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: before-after, product-comparison, shopping-comparison, flight-comparison, model-leaderboard, trip-comparison +shared skills: artifact_note, board_layout, comparison_grid +shared skills CHANGED by GEPA: (none) + +**Seed 0.7921 -> Best 0.8527 (+0.0606)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo3_dashboard/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo3_dashboard/best_topology_skills.json new file mode 100644 index 0000000..66f2dc1 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo3_dashboard/best_topology_skills.json @@ -0,0 +1,11 @@ +{ + "board_layout": "TASK: You design a single-screen \"board plan\" \u2014 a specification for a visual board (dashboard, report, map, or component) that another system will render. You receive these inputs:\n- Journey: a slug + type in parens, e.g. \"sre-incident (component)\", \"service-health (flow)\", \"task-progress (component)\".\n- Use case: a one-line description naming the subject and EVERY element the board must contain.\n- Reader: who reads the board (set tone/scope to them).\n- Goal: the single insight or action the reader must get.\n\nYou are scored on four axes: comprehension (does the layout make the goal obvious at a glance), visual_quality, geometry (everything must fit one screen \u2014 NO overflow, clipping, or off-canvas tiles), and rubric (every required element from the use case must be present and substantive). Optimize all four.\n\nOUTPUT FORMAT\n- Start with a title line/header naming the subject AND its scope (e.g. 
specific service, person, incident ID, time window, and reader view).\n- State the layout explicitly: orientation (lay top-to-bottom and say so), and the structure \u2014 rows for a report read in order, a balanced 2x2 grid for a dashboard of peer tiles, a layered top-to-bottom DAG for a dependency/flow map.\n- Title every pane/section.\n- Use real, specific values everywhere \u2014 names, timestamps, metrics, IDs, thresholds. NEVER use placeholders.\n\nGEOMETRY \u2014 THE #1 FAILURE MODE (most boards lose points here; fix it aggressively):\n- The whole board MUST fit one screen with zero clipping and zero off-canvas elements. This overrides completeness of detail.\n- Budget space BEFORE writing. A single screen holds roughly 4\u20136 compact rows OR a 2x2 grid OR ~5\u20136 DAG tiles. Do not exceed this.\n- Keep each element terse: short labels, 1 line of meta per item, abbreviated values. Cut prose, verdict paragraphs, and footnotes that add height.\n- CAP list/table/timeline lengths. A table should be ~5\u20138 rows max; a timeline ~4\u20135 events; a checklist ~4\u20135 items. If the use case implies more, summarize/group rather than list every item \u2014 never let content grow past the screen.\n- For flow/DAG maps: place the entry point top-center; arrange sibling dependencies in side-by-side column lanes; keep every tile inside the canvas bounds (no tile pushed off the edge). Route edges within lanes so none crosses an unrelated tile. Color edges by the TARGET node's health.\n- Prefer fewer, denser panes over many stacked panes. 
If you have many required elements, merge related ones into one compact pane instead of adding rows.\n\nCOMPREHENSION (don't let geometry trimming hurt the goal):\n- Make the answer to the Goal jump out: lead with the headline state/verdict, use a clear color-coded status system (\ud83d\udfe2 healthy/done, \ud83d\udfe1 amber/degraded/at-risk, \ud83d\udd34 red/down/blocked), and put the most decision-relevant element first.\n- For at-a-glance maps, ensure the failure/root-cause path is visually traceable (colored chain from symptom to cause) \u2014 but keep it inside the canvas.\n- Use concrete thresholds so states are justified (e.g. \"p95 > 300ms = SLO breach\").\n\nRUBRIC \u2014 INCLUDE EVERY NAMED ELEMENT:\n- Parse the use case and reader/goal for each required component and include ALL of them as real, substantive sections \u2014 even space-constrained, never drop one.\n- Common component types imply specific required panes; if the use case (or journey slug) implies them, include them and give each real content:\n - Incident boards: severity, timeline, impact metric (e.g. p95), IC/owner, action items.\n - Health/dependency maps: each named service as a tile with status, the down/degraded ones flagged, edges showing the chain, a legend.\n - Progress/task components: overall completion, per-workstream breakdown, blockers, work-items, a RECENT ACTIVITY log (timeline of what happened), and a NEXT UP / upcoming section \u2014 the activity and next-up panes are mandatory and must contain real dated entries, not be omitted or thinned to nothing.\n- Always include a compact legend/color key when you use a status color scheme.\n\nPRIORITIZATION: First guarantee everything fits one screen (geometry); second include every required element (rubric); third make the goal obvious (comprehension). 
Trade verbosity and list length \u2014 never required elements \u2014 to satisfy geometry.", + "dashboard_grid": "DASHBOARD LAYOUT: a balanced grid of peer tiles \u2014 include every panel the rubric names. Use stat cards (dim label / large value) with a polarity badge tied to a named threshold; set embedded charts' width to \"container\". Keep each tile concise; never drop a required panel to save space.", + "artifact_note": "TASK\nYou generate UI layouts as JSON for a rendering engine. The rendered image is then scored by an automated grader on four axes: comprehension (does every required item read clearly), visual_quality, geometry (does anything overflow/clip), and rubric. Your job is to produce a layout that maximizes ALL of these.\n\nINPUT FORMAT\nEach task gives you:\n- Journey: a template name plus a mode in parentheses \u2014 either \"(component)\" (emit a Mantine-style component tree using {\"type\",\"props\",\"children\"}) or \"(panes)\" (emit {\"layout\",\"panes\":[...]} where each pane is {\"title\",\"type\",\"content\"} and type is one of \"markdown\" or \"vegalite\").\n- Use case: the concrete scenario, listing the specific items/sections that MUST appear.\n- Reader: the audience.\n- Goal: what the reader needs to accomplish.\n\nCORE RULE\nEmit every required item from the Use case as its OWN labeled, discrete element (its own header, card, badge, table row, chart, structured field, or inline spec) \u2014 NEVER buried in prose. The grader scores the rendered image, so each required item must be individually visible and unambiguous.\n\nGEOMETRY \u2014 THE #1 FAILURE MODE (avoid overflow/clipping)\nGeometry errors are what cost the most points. The render area is fixed and content gets clipped if you overpack. Apply these rules:\n- Prefer FEWER, WIDER columns over many narrow ones. Do not put long text inside narrow multi-column cards (e.g. 4-column timeline cards whose text wraps and clips). 
A 4-up grid of cards each containing a badge + a sentence WILL overflow \u2014 use a vertical List or a 2-up grid instead.\n- For timelines, impact stats, and action items, prefer a single-column List or a Card-wrapped List over a SimpleGrid of many cards. A 3-up SimpleGrid of compact stat cards (label + big number + one short badge) is safe; anything denser is risky.\n- Keep text inside cells short \u2014 a few words for badges/labels, one short clause for list items. Move detail into separate fields rather than long wrapping strings.\n- In panes/markdown mode: keep tables narrow (\u22644 columns), keep cell contents short, and limit the number of rows so the tile does not overflow. Avoid stacking many heavy elements (big heading + progress bar + multi-row table + threshold note) in one tile \u2014 split or trim.\n- In vegalite panes: set \"width\":\"container\", give y-scales explicit domains, and keep the number of data points modest so axis labels don't crowd.\n\nCOMPREHENSION \u2014 MAKE EVERY REQUIRED ITEM EXPLICIT\n- Re-read the Use case and treat each listed item as a mandatory labeled section. For an incident: status/severity header (e.g. SEV1, ACTIVE/MITIGATING), IC and roles, a one-line impact summary, impact stats (affected users, error rate, duration with thresholds), a timeline from detect\u2192mitigate\u2192resolve, and follow-up/action items with owners. 
For an observability dashboard: SLO panel, latency percentile chart (p50/p95/p99), top-errors table, throughput chart.\n- Give each section a clear Title/header so the grader can locate it.\n- Use status color/emoji conventions consistently and include a short legend (\ud83d\udd34 active/breach \u00b7 \ud83d\udfe1 degraded/in-progress \u00b7 \ud83d\udfe2 resolved \u00b7 \u2b1c/\u26aa pending).\n- Include concrete reference values (thresholds, SLO targets, timestamps, owners) as their own inline fields so the reader can act without reading prose.\n- Comprehension scores plateau even on clean layouts (~0.71 for incidents), so do NOT chase comprehension by cramming more content \u2014 that triggers geometry penalties. Once every required item is present and labeled, STOP adding; protect geometry instead.\n\nSTRATEGY SUMMARY\n1. Parse the required items from the Use case.\n2. Map each to its own labeled element.\n3. Choose the LEAST dense layout that still shows everything (single-column lists, 2\u20133 wide cards max, narrow short tables).\n4. Keep every text string short to prevent wrapping/clipping.\n5. Add a legend and explicit thresholds/values.\n6. Verify nothing would overflow before finalizing.\n\nOutput only the JSON for the requested mode.", + "tail_service-health": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'service-health'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each service tile is colored by health: healthy=green, degraded=amber, down=red\n- Each service tile shows an inline sparkline (latency/throughput trend)\n- Each tile shows a current SLI value (e.g. 
'p95 214ms') in a meta line\n- Each tile shows the service role/purpose in a sub line\n- Edges connect services in dependency order, colored to match the target's health\n- A legend maps the health colors to healthy/degraded/down\n- No tiles overlap and edges do not pass through unrelated tiles\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_metrics-dashboard": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'metrics-dashboard'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A row of stat cards each shows a dimmed label and a large value\n- Stat cards carry status badges (color-coded) where relevant\n- An embedded trend chart (VegaLite) is present inside a card\n- The dashboard has a clear title heading\n- The embedded chart fits its card width (not bleeding outside)\n- Content is arranged in a clean top-down stack (title, stats row, chart)\n- All stat values and labels are legible and aligned\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_task-tracker": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'task-tracker'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A progress ring shows overall completion percentage\n- Stat cards summarize counts by status (e.g. 
done / in-progress / todo / blocked)\n- A task table lists tasks with their status, owner, and other columns\n- A blocker alert highlights at least one blocked item\n- Task statuses are color-coded consistently across the ring, cards, and table\n- The layout reads top-down as a report (ring/stats, then table, then alert)\n- The tracker has a clear title naming the sprint/release\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_sre-incident": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'sre-incident'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A header states the incident severity, status, and a one-line summary\n- A timeline of incident events (detect, escalate, mitigate, resolve) is shown\n- Impact metrics (affected users, error rate, duration) are shown as stats\n- Severity/status is color-coded (e.g. red for active, green for resolved)\n- Follow-up action items or next steps are listed\n- An incident commander/owner is named\n- The page has a clear incident title\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_observability-dashboard": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'observability-dashboard'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Panes are arranged as an independent-tile grid (a 2x2 mission-control board)\n- An SLO panel with ring progress and a status badge is present\n- A latency chart (e.g. 
p50/p95/p99) pane is present\n- A top-errors table and/or alert pane is present\n- A throughput chart pane is present\n- The grid mixes component panels and chart panels\n- Each chart pane has a title and axis titles\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_task-progress": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'task-progress'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- a clear overall completion indicator is shown (e.g. '9 / 49 done' or a % with the denominator), not just a vague status\n- progress is broken down by phase/workstream/category \u2014 multiple sub-progresses, not a single number\n- counts of items by state are visible: done / in-progress / blocked / to-do\n- a table or list of work items, EACH with owner + status + a concrete detail (ETA, result, or note) \u2014 real context per row, not bare titles\n- blockers / risks / at-risk items are explicitly called out (e.g. an alert), or stated as none\n- a timeline/log of what RECENTLY happened (temporal context: last completed, when), not just a static snapshot\n- the next actions / upcoming items are shown so the reader knows what happens next\n- information-dense yet scannable \u2014 packs status + context into the space, not a sparse board with one stat; still legible, nothing clipped\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo3_dashboard/report.md b/scripts/experiments/gepa-flowchart/overnight/topo3_dashboard/report.md new file mode 100644 index 0000000..72e6836 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo3_dashboard/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: service-health, metrics-dashboard, task-tracker, sre-incident, observability-dashboard, task-progress +shared skills: artifact_note, board_layout, dashboard_grid +shared skills CHANGED by GEPA: artifact_note, board_layout + +**Seed 0.6320 -> Best 0.8231 (+0.1911)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo3_graph/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo3_graph/best_topology_skills.json new file mode 100644 index 0000000..d2e1444 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo3_graph/best_topology_skills.json @@ -0,0 +1,34 @@ +{ + "graph_entity_lanes": "GRAPH LAYOUT (entity/relationship): each entity is a type:\"entity\" node with its attributes in data.fields[] (one {name,type,key?} per row, mark PK/FK) and an explicit width/height \u2014 NEVER cram fields into data.label (that makes one giant overlapping box). Arrange entities in columns; draw each relationship as a labeled edge with cardinality (1, 0..1, 1..*) and an arrowhead. Every entity participates in at least one relationship.", + "graph_process_spine": "GRAPH LAYOUT (process flow): keep the happy path on one straight top-to-bottom spine; send branch/exception steps to the side with edges routed AROUND \u2014 never through \u2014 unrelated nodes. ~5-10 nodes. 
Label every edge with its trigger/condition and give it a visible arrowhead; color nodes by role via type:\"change\"+status and add a legend.", + "graph_zoned_tiers": "GRAPH LAYOUT (zoned architecture): group nodes into labeled zones/tiers via groups[]; show the external entry point and the request/data flow as directed edges between components; distinguish managed services from compute; add a legend. Keep zones from overlapping and route edges around unrelated nodes.", + "board_layout": "BOARD LAYOUT: lay the board top-to-bottom and state the direction; give it a title naming the subject AND its scope. Include EVERY element the rubric requires \u2014 keep each concise and sized so the whole board fits one screen without clipping (don't drop a required element to save space). Use real, specific values, never placeholders. For a multi-section board use panes: rows for a report read in order, a balanced 2x2 grid for a dashboard of peer tiles; title each pane.", + "artifact_note": "Emit every required item as its OWN labeled, discrete element (own header / structured field / inline spec), never as prose \u2014 the grader scores the rendered image.", + "tail_architecture-zones": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'architecture-zones'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Nodes are clustered into clearly labeled horizontal zone bands (e.g. 
Edge / Frontend / Services / Data) stacked top-to-bottom, not a flat scatter\n- The overall flow direction reads top-to-bottom, with edges connecting only adjacent tiers (no edge visibly skips over a middle band)\n- Each zone band has its own background tint/color distinguishing it from neighboring bands\n- Nodes are color-coded by role/status (not all plain grey) with each node showing a label plus a secondary sub line\n- Edge arrows are colored (not all grey) and visually distinguishable by purpose\n- A legend/key panel maps colors to their meaning\n- No node boxes overlap and no edge line passes through an unrelated node box\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_swimlane-process": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'swimlane-process'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The diagram is divided into parallel full-length lanes, each labeled with one actor/role\n- Each process step sits inside the lane of the actor who performs it\n- Cross-lane edges visibly hop between lanes at each hand-off between actors\n- The main happy path is colored (e.g. green) and visually distinct from any error/retry path\n- A failure or retry path is shown as a dashed/red edge distinct from the happy path\n- A legend explains the path colors (happy vs error/async)\n- Each step node carries a short label plus a sub/meta detail, not a bare single word\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_user-journey": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'user-journey'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Nodes progress through semantic tones: a blue/info entry, amber/active steps, and a green/done success outcome\n- At least one error/failure branch is shown with a red node and a dashed edge\n- The journey reads top-to-bottom (TB direction)\n- Steps carry inline glyph icons appropriate to the action (not bare text)\n- Each step shows a sub line (e.g. screen, endpoint, or note) under its label\n- A legend maps the tone colors to journey states (entry/step/success/failure)\n- Each edge color matches the tone of the node it points to\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_state-machine": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'state-machine'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each distinct state is a clearly labeled node\n- Every transition edge carries a label naming the triggering event/action\n- Every transition edge has a visible arrowhead showing direction\n- Terminal/final state(s) are visually distinguished (e.g. green/done tone)\n- An initial/entry state is identifiable as the starting point\n- The diagram is laid out top-to-bottom\n- No transition edge passes through an unrelated state node\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_er-diagram": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'er-diagram'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each table/entity is a titled box listing its fields as rows\n- Fields show their data types and key fields (PK/FK) are marked\n- Edges between entities are labeled with cardinality (e.g. 1..*, places, has)\n- Relationship edges have visible arrowheads/markers\n- The schema is laid out top-to-bottom\n- Entity boxes do not overlap and edges do not cross through unrelated entity boxes\n- Every entity participates in at least one labeled relationship (no orphan tables)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_class-diagram": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'class-diagram'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each class is a titled box showing attribute rows and method() rows\n- Method members are visibly distinguished from attributes (e.g. trailing parentheses)\n- Inheritance is drawn with parents above children (BT orientation)\n- Edges show inheritance/association between classes with arrowheads\n- Class boxes do not overlap and edges do not cross through unrelated boxes\n- The parent-child hierarchy is readable at a glance (a clear tree, not a tangle)\n- Every class connects to at least one other (no orphan class)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_event-driven": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'event-driven'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Producers fan in to a central broker/topic node which then fans out to consumers\n- A clearly identified broker/topic/queue node sits between producers and consumers\n- Stream/queue edges are visually marked as flowing (animated dash) rather than static plain lines\n- A dead-letter/error path is shown with a dashed red edge to a DLQ node\n- The topology is laid out top-to-bottom\n- Producer, broker, and consumer nodes are color-coded by role (not all grey)\n- No node boxes overlap and edges do not pass through unrelated nodes\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_query-plan": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'query-plan'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The plan is a top-down tree of SQL operator nodes (scans, joins, aggregates)\n- Operators are colored by cost: cheap=green, moderate=amber, hot=red\n- The most expensive operator (hot scan/join) is flagged with an icon or red highlight\n- Each operator shows rows and/or cost figures in a meta/sub line\n- Operators show the relevant predicate/join key/grouping detail\n- The operator tree reads top-to-bottom\n- A legend maps the cost colors to their meaning\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_data-lineage": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'data-lineage'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Nodes are grouped into labeled tier zones (Sources / Staging / Marts / Consumers)\n- Data flows in one consistent direction from sources toward consumers\n- Datasets show a freshness/SLA indicator in a meta line\n- At least one stale-past-SLA edge is shown dashed/red distinct from fresh green edges\n- Fresh data-flow edges are colored green\n- A legend distinguishes fresh vs stale edges\n- No nodes overlap and edges do not pass through unrelated nodes\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_recursion-tree": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'recursion-tree'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Recursive calls form a top-down tree, each call a node\n- Base-case leaf nodes are colored green/done\n- Each call node shows its return value in a sub line\n- Memoized/cache-hit calls are visually distinct (e.g. blue/info) with a 'cached' edge label, if applicable\n- The tree reads top-to-bottom from the root call\n- No call nodes overlap and edges do not pass through unrelated nodes\n- A legend maps node colors to base case / internal call / memoized\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_critical-path": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'critical-path'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Tasks are laid out left-to-right reflecting the time/schedule axis\n- The critical path tasks are highlighted red with thick edges and a 'bolt'-style icon (not read as 'failed')\n- Tasks with slack are shown neutral/grey with dashed edges, distinct from the critical chain\n- Each task shows its duration (e.g. '6d') in a meta line\n- Each task shows its owner (and optionally slack) in a sub line\n- A two-row legend distinguishes critical path (red) from has-slack (grey-dash)\n- No task nodes overlap and edges do not pass through unrelated nodes\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_pr-review": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'pr-review'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Code-entity nodes are colored by diff status: added=green, removed=red(strikethrough), modified=amber, unchanged=neutral\n- Each node carries a kind eyebrow tag (module/function/class/file)\n- Each node shows its file path in a sub line\n- Changed nodes show a LOC delta (e.g. +42) in a meta line\n- Edges to removed entities are dashed/red, distinct from other edges\n- A legend maps the diff-status colors to added/removed/modified/unchanged\n- The change graph is laid out top-to-bottom\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_okr-tree": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'okr-tree'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The tree shows three levels: Objective at top, Key Results below, Initiatives at the bottom\n- Nodes carry kind tags (objective / key result / initiative)\n- Objective and key-result nodes show a percent/progress figure in a meta line\n- Nodes are colored by status: on-track=green, at-risk=amber, behind=red\n- Edges are colored to match their target node's status\n- A legend maps the colors to on-track / at-risk / behind\n- The tree is laid out top-to-bottom from objective to initiatives\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_build-pipeline": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'build-pipeline'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each pipeline stage (build, test, scan, deploy) is a node\n- Stages are colored by status: passed=green, running=amber, failed=red\n- Stages connect in execution order with arrowheads\n- Stages show a duration or step detail in a meta/sub line\n- If a stage fails, it is clearly red and the downstream stages reflect being blocked/skipped\n- A legend maps status colors to passed/running/failed\n- No stage nodes overlap and edges do not pass through unrelated nodes\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_k8s-topology": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'k8s-topology'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Resources are grouped into labeled namespace/cluster zones\n- Distinct resource kinds (Ingress, Service, Deployment, Pod, ConfigMap, Secret, Node) are shown as labeled nodes\n- The ingress\u2192service\u2192deployment\u2192pods traffic path is traceable via edges\n- Edges are colored by purpose (traffic / manages / config-mount / scheduled-on) and distinguishable\n- Pods show a status (e.g. Running) and are color-coded\n- A legend maps edge colors to their purpose\n- No resource nodes overlap and edges do not pass through unrelated nodes\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_security-threat-model": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'security-threat-model'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- An attacker goal / protected asset is the root node\n- Attack vectors (kind=vector) branch from the asset with descriptive subs\n- Each vector connects to one or more mitigation nodes (kind=mitigation)\n- Nodes/edges are colored by mitigation state: mitigated=green, partial=amber, open=red\n- An unmitigated gap is shown with a dashed red edge or an explicit 'gap' label\n- A legend maps colors to mitigated / partial / open\n- The threat tree is laid out top-to-bottom\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_sequence-diagram": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'sequence-diagram'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each participant has a labeled header box with a vertical lifeline beneath it\n- Messages are drawn as horizontal arrows between lifelines in vertical time order (top = earliest)\n- Request/call arrows are visually distinct from response/return arrows\n- Every message arrow is labeled with the call/response it represents\n- Every declared participant sends or receives at least one message\n- Lifelines are evenly spaced and vertical, with no overlapping arrows\n- Each arrow clearly points from sender to receiver\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_call-hierarchy": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'call-hierarchy'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A root function is at the top of the hierarchy\n- Called functions are nested as children under their callers\n- Each function node shows its file/location\n- If a before/after diff, nodes are colored by change (added/removed/modified/unchanged)\n- Changed nodes carry a note explaining the change\n- Relevant nodes show meta info (e.g. timing or call count)\n- The hierarchy reads cleanly as a tree (clear parent-child nesting, no tangle)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_c4-architecture": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'c4-architecture'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Three flow panes are stacked top-to-bottom as increasing zoom levels\n- The top pane is a System Context view (the system as a box with users and external systems)\n- The middle pane is a Container view (apps/services/data stores inside the system)\n- The bottom pane is a Component view (building blocks inside one container)\n- Edges are labeled with the interaction and protocol\n- Each level distinguishes person / this-system / external (color or legend)\n- Each level is a small, clean diagram (roughly 7 nodes or fewer)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_ml-pipeline": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'ml-pipeline'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each pipeline stage is a clearly labeled node (ingest, features, train, eval, deploy)\n- Stages are connected in execution order with directed arrows\n- An eval/validation gate before deployment is explicitly shown\n- Data/model artifacts passed between stages are labeled\n- A branch on eval pass/fail (deploy vs retrain) is shown\n- The pipeline is laid out top-to-bottom\n- No edge passes through an unrelated stage node\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_terraform-resource-graph": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'terraform-resource-graph'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each resource/data source is a labeled node showing its type\n- Directed edges show resource dependencies (what depends on what)\n- The provider/root config is identifiable as a source node\n- The graph implies a clear creation order (sources before dependents)\n- Resources are visually distinguished by type or tone\n- The graph is laid out top-to-bottom\n- No edge passes through an unrelated resource node\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_cloud-architecture": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'cloud-architecture'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Resources are grouped into labeled zones (VPC, public/private subnets, tiers)\n- Each resource is a labeled node showing its service/type\n- Directed edges show the request/traffic flow between components\n- The internet/user entry point and the cloud boundary are shown\n- Managed services (DB, queue, cache) are distinguished from compute\n- A legend or tones explain the zone/resource types\n- Nodes and zones do not overlap; edges avoid unrelated nodes\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_network-topology": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'network-topology'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- VPC(s) and their public/private subnets are shown as zones\n- Internet/NAT gateways and their placement are shown\n- Routing between subnets/gateways is indicated by edges or labels\n- Security groups / firewall rules are represented\n- Subnets/VPCs are labeled with their CIDR ranges\n- Edges show the allowed traffic direction\n- Zones and nodes do not overlap; edges avoid unrelated nodes\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_terraform-module-tree": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'terraform-module-tree'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The root module is the top of the hierarchy\n- Child modules are nested under their calling module\n- Each module lists the resources it creates\n- Module inputs/outputs (or key variables) are indicated\n- Reused/shared modules are identifiable\n- The composition hierarchy reads top-down clearly\n- Title names the root configuration\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_user-flow": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'user-flow'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each node represents a screen/state of the app\n- Edges are labeled with the user action/trigger causing the transition\n- A clear entry/start screen is identifiable\n- Decision points (e.g. 
logged-in vs not, success vs error) branch\n- A primary happy-path is distinguishable\n- End/goal states are marked\n- Laid out top-to-bottom\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_sitemap": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'sitemap'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The home/root screen is the top of the hierarchy\n- Pages/sections nest under their parent\n- Navigation groups/sections are clear\n- Multiple depth levels are shown (primary nav \u2192 sub-pages)\n- Each screen/page is clearly labeled\n- Laid out top-to-bottom as a tree\n- No edge passes through an unrelated page node\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_generic-flow": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'generic-flow'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A consistent layout direction (top-down or left-right) suited to the content\n- Every node has a concise, meaningful label\n- Edges are labeled wherever the relationship is not obvious\n- Nodes are color-coded by role/type, with a legend when colored\n- Appropriate level of detail \u2014 not too sparse, not overcrowded\n- No node overlaps and no edge crosses through an unrelated node\n- The diagram fully covers the requested topic with no obvious gaps\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo3_graph/report.md b/scripts/experiments/gepa-flowchart/overnight/topo3_graph/report.md new file mode 100644 index 0000000..7d44013 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo3_graph/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: architecture-zones, swimlane-process, user-journey, state-machine, er-diagram, class-diagram, event-driven, query-plan, data-lineage, recursion-tree, critical-path, pr-review, okr-tree, build-pipeline, k8s-topology, security-threat-model, sequence-diagram, call-hierarchy, c4-architecture, ml-pipeline, terraform-resource-graph, cloud-architecture, network-topology, terraform-module-tree, user-flow, sitemap, generic-flow +shared skills: artifact_note, board_layout, graph_entity_lanes, graph_process_spine, graph_zoned_tiers +shared skills CHANGED by GEPA: (none) + +**Seed 0.4337 -> Best 0.4337 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo3_report/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo3_report/best_topology_skills.json new file mode 100644 index 0000000..bf3ea31 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo3_report/best_topology_skills.json @@ -0,0 +1,26 @@ +{ + "report_rows": "REPORT LAYOUT: a cohesive top-down document read in order (rows), not a peer grid. Title each section and make sections visually distinct (heading + spacing). Embed REAL artifacts (a chart spec, a table, a map, an image) rather than describing them in prose.", + "board_layout": "You design single-screen \"board plans\" \u2014 structured visual layouts described in text. 
\n\nINPUT FORMAT: You receive Journey (a board type + form factor like \"component\" or \"panes\"), Use case (what the board depicts, often listing the specific elements it must contain), Reader (the audience), and Goal (what the reader should achieve).\n\nYOUR OUTPUT: A board plan with a layout description, a title, and the concrete content of every section/pane.\n\n=== CORE RULES ===\n\n1. EXTRACT THE RUBRIC FROM THE USE CASE. The Use case sentence enumerates the required elements \u2014 each named thing is a separately-scored rubric item. Render EVERY one explicitly, concretely, and unmistakably. Examples of how use-case phrases map to required elements:\n - \"located root cause\" / debugging \u2192 a ROOT-CAUSE ALERT element (dedicated, visually dominant panel) AND an ACTIVE/CURRENT-STEP marker on the final/active step.\n - \"loss-curve chart\" / \"quantitative\" / any chart \u2192 an actual chart with REAL plotted data points (give the full data series), axis labels, and a title stating the takeaway.\n - \"the mechanism\" / \"how it works\" \u2192 a clear step-by-step MENTAL-MODEL FLOW with labeled nodes and directional arrows.\n - \"image\" \u2192 an IMAGE element that is genuinely present and prominent (large hero, not a thumbnail) with a concrete filename and described content.\n - \"ingredients\" \u2192 a complete itemized INGREDIENTS LIST with quantities.\n - \"steps\" \u2192 an explicitly NUMBERED ordered step list.\n - \"nutrition note\" / \"tips\" \u2192 a NUTRITION or TIPS element with real values.\n - \"misconception\" \u2192 a dedicated MISCONCEPTION ALERT box (red/amber, clearly flagged).\n If the use case names it, it is required. Do not merely mention it in passing \u2014 give it its own clearly-labeled, self-contained section so it scores.\n\n2. FIT ONE SCREEN \u2014 NEVER OVERFLOW OR CLIP. This is the most common failure. Aggressively constrain total content volume:\n - Cap the number of items. 
Long timelines/lists overflow: keep step lists to ~6\u20138 items max; merge or summarize rather than listing 13 steps.\n - Keep each element terse \u2014 short labels, one dim detail line max, no long paragraphs.\n - Do NOT add optional flourishes (legends, badges, sub-captions, \"flow check\" notes, active-step emphasis paragraphs) that consume vertical space; spend the budget on required rubric elements instead.\n - Prefer compact tables and tight grids over tall stacked prose.\n - Before finalizing, mentally check: does every required element plus its content fit one screen without scrolling? If not, shorten content \u2014 but NEVER drop a required element.\n\n3. LAYOUT & TITLE:\n - Lay out top-to-bottom and state the read direction explicitly.\n - Title names the subject AND its scope.\n - Multi-section boards use titled panes: rows for a report read in order; a balanced 2\u00d72 grid for a dashboard of peer tiles. Title each pane.\n\n4. CONTENT QUALITY:\n - Use real, specific values \u2014 never placeholders. Charts get full real data series; tables get real numbers; flows get real example values.\n - Make required alert/marker elements visually dominant and unambiguous (dedicated panel, bordered box, badge) so they are clearly realized, not implied.\n\nPRIORITY ORDER when trading off: (1) include every required element, (2) keep it on one screen, (3) make each element concrete and high-quality. 
Comprehension and rubric coverage matter more than extra decorative detail \u2014 cut decoration first, content second, required elements never.", + "artifact_note": "Emit every required item as its OWN labeled, discrete element (own header / structured field / inline spec), never as prose \u2014 the grader scores the rendered image.", + "tail_pr-review-summary": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'pr-review-summary'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A title plus a one-line statement of the PR's goal is present\n- A two-column Before/After comparison shows the problem vs the fix, each with a colored badge and a list\n- A timeline describes how the new flow runs after the change\n- A green alert states the validation/test result\n- An FAQ section answers anticipated reviewer questions\n- An outbound link to the actual pull request is present (e.g. 'View PR #128 \u2192')\n- The summary reads as a coherent narrative (the why/how), not a bare graph\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_risk-matrix": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'risk-matrix'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A grid with Impact on one axis and Likelihood on the other\n- Each cell is tinted from green (low) through amber to red (high) by likelihood \u00d7 impact\n- Individual risks appear as small labeled chips/badges placed in the appropriate cell\n- Both axes are labeled (Impact High\u2192Low, Likelihood Low\u2192High)\n- A swatch legend explains the low/medium/high severity colors\n- The severity gradient reads along the diagonal (low corner to high corner)\n- Cells show or imply the numeric score so color isn't the only channel\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_raci-matrix": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'raci-matrix'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Rows are deliverables/tasks and columns are roles\n- Each filled cell holds a single R/A/C/I letter badge\n- Each row has exactly one Accountable (A) cell\n- R/A/C/I are visually distinct (A prominent, R filled, C outline, I light)\n- A legend spells out what R, A, C, and I mean\n- Cell badges are centered for a clean grid read\n- The matrix has a clear title\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_task-checklist": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'task-checklist'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A checklist with multiple labeled items is shown\n- Items show clear checked vs unchecked states (visible tick boxes)\n- The list includes both completed and pending items\n- Items are grouped (e.g. an agent-driven section and a human sign-off section) where appropriate\n- The checklist has a clear title naming the process\n- Overall progress is evident from the ratio of checked to unchecked items\n- Each item's label is legible and describes a concrete step\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_stacktrace": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'stacktrace'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The trace is rendered as monospace text lines preserving indentation\n- Lines are colored by role: the culprit/error line red, your code blue, library/runtime dimmed\n- Call depth is shown via leading-space indentation per line\n- An alert calls out the likely cause/culprit frame\n- A badge legend explains the line colors\n- Each frame shows the function and file/location legibly\n- The stack reads top-to-bottom as a linear call path, not a graph\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_flame-graph": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'flame-graph'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Frames are stacked by call depth (deeper calls below their parent)\n- Each frame's width represents its share of CPU samples/time\n- Child frames sit within the horizontal x-range of their parent\n- Frames are colored on a warm palette by self-time (hotter = more self-time)\n- Frames show a (possibly truncated) function name and time\n- A caption notes that width = time\n- The widest (hot-path) frame stack is identifiable\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_agent-trace": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'agent-trace'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Agent steps are shown as a vertical timeline of items\n- Items are colored by kind (think/tool-call/observation/result/error)\n- Each step carries a small kind badge\n- A window header shows a badge with the step range (e.g. 'steps 6-19 of 19')\n- Each step has a dimmed detail line describing what happened\n- A red alert near the end names the root cause/result\n- The latest/active step is visually emphasized\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_diy-project-plan": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'diy-project-plan'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A title plus badges (e.g. 
difficulty, time, cost) head the plan\n- A materials/tools table lists what's needed\n- An interactive checklist of build steps is present\n- At least one video tutorial reference card with a thumbnail is shown\n- A budget chart (VegaLite) breaks down the cost\n- The plan reads as a cohesive top-down document, not a grid of peers\n- A map or list of where to buy materials is included\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_map-routes": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'map-routes'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- An interactive map with tiles is shown\n- Multiple labeled markers (an origin and destinations) are placed on the map\n- Route lines/polylines connect the origin to destinations\n- Routes carry labels (e.g. distance/time)\n- Markers are color-coded (e.g. origin vs destinations)\n- A legend table beside the map keys the markers/routes\n- The map view includes all markers (none cut off)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_recipe-display": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'recipe-display'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A recipe title plus a hero image of the dish is shown\n- Badges show prep time, cook time, servings, and difficulty\n- An ingredients list with quantities is present\n- Cooking steps are shown as an ordered/numbered list or timeline\n- A nutrition summary or tips section is included\n- The recipe reads as a cohesive top-down card (image, meta, ingredients, steps)\n- The dish image renders prominently (not a tiny thumbnail)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_algorithm-walkthrough": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'algorithm-walkthrough'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A markdown/code pane shows the annotated algorithm code\n- A component pane shows a step-by-step trace table of the state at each step\n- A vegalite pane shows a complexity curve for the algorithm\n- The three panes (code / trace / complexity) are visually distinct and labeled\n- The trace table rows are ordered by execution step\n- The complexity chart has a title and axis titles\n- Together the panes explain how the code works, not just show it\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_explainer": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'explainer'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The top pane opens with a plain-language hook/surprising claim, not the abstraction\n- A flow pane shows the reasoning chain as color-coded nodes (postulate\u2192reasoning\u2192conclusion)\n- A vegalite pane shows the key quantitative relationship with a title saying what to notice\n- A component pane gives concrete numbers and a real-world anchor\n- A common misconception is addressed (an alert)\n- The panes are stacked top-down as a cohesive explainer (hook\u2192mechanism\u2192chart\u2192grounding)\n- Each layer is short, building intuition over symbols\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_debug-snapshot": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'debug-snapshot'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Panes are stacked top-down (error/state above the call stack)\n- A component pane shows an alert with the error message\n- A table of variable values at the failure point is present\n- A flow pane shows the call stack at the moment of failure (top-down)\n- The culprit frame in the call stack is visually highlighted\n- Each pane is clearly labeled (error/state vs call stack)\n- Together the panes capture one moment of failure coherently\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_eval-scorecard": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'eval-scorecard'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each evaluation criterion is its own row\n- Each criterion shows a pass/partial/fail verdict, color-coded\n- Each criterion shows a numeric score or weight\n- An overall aggregate score is prominently shown\n- Failing criteria are visually highlighted\n- Each criterion has a short note or evidence string\n- Title names the system/model under evaluation\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_terraform-plan": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'terraform-plan'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Resources are grouped by action (create / update / replace / destroy)\n- Each action is color-coded (green add, yellow change, red destroy)\n- Each resource shows its address/type (e.g. aws_instance.web)\n- A summary shows counts: N to add, M to change, K to destroy\n- Notable changed attributes are listed per resource\n- Destructive (replace/destroy) changes are visually warned\n- Title names the environment/workspace being planned\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_travel-itinerary": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'travel-itinerary'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each day of the trip is its own clearly labeled section/pane\n- Activities have times/order within each day\n- A map pane shows the destinations or route\n- A budget/cost element (chart or table) is included\n- Transport and accommodation logistics are noted\n- Must-see highlights are flagged\n- Title names the trip (destination + duration)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_weekend-plan": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'weekend-plan'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Saturday and Sunday are clearly separated\n- Activities are placed in time blocks (morning/afternoon/evening or hours)\n- A mix of activity types (chores, leisure, social) is shown\n- A checklist of to-dos with checkboxes is included\n- Priorities or must-dos are flagged\n- Realistic buffers, meals and rest are included\n- Title names the weekend\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_meal-plan": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'meal-plan'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A grid of days x meals (breakfast/lunch/dinner)\n- Each slot names a specific dish\n- A consolidated grocery list with checkboxes\n- Variety and nutritional balance across the week\n- Prep or leftover notes are included\n- Dietary constraints are reflected or tagged\n- Title names the plan/week and any diet\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_recipe": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'recipe'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A clear ingredients list with quantities\n- Numbered preparation steps in order\n- Prep/cook time and number of servings are shown\n- Difficulty or skill level is indicated\n- Nutrition info or chef tips are included\n- A visual/photo of the dish is included\n- Title names the dish\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_generic-component": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'generic-component'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Information is organized with a clear visual hierarchy (headings/sections)\n- Uses appropriate UI components for the data (cards/tables/lists/badges)\n- Sections are titled and scannable\n- Covers the requested information completely with concrete details\n- Related items are visually grouped\n- Dense but legible \u2014 good use of space, neither cramped nor empty\n- Key items/values are emphasized (badges, color)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_generic-panes": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'generic-panes'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Panes form a coherent whole (rows for a report, grid for a dashboard)\n- Each pane uses the right type for its content\n- Each pane is titled\n- There is a clear reading order or grouping\n- Together the panes tell a complete story about the topic\n- Panes are balanced \u2014 none empty or overflowing\n- An overall title frames the board\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo3_report/report.md b/scripts/experiments/gepa-flowchart/overnight/topo3_report/report.md new file mode 100644 index 0000000..127a745 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo3_report/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: pr-review-summary, risk-matrix, raci-matrix, task-checklist, stacktrace, flame-graph, agent-trace, diy-project-plan, map-routes, recipe-display, algorithm-walkthrough, explainer, debug-snapshot, eval-scorecard, terraform-plan, travel-itinerary, weekend-plan, meal-plan, recipe, generic-component, generic-panes +shared skills: artifact_note, board_layout, report_rows +shared skills CHANGED by GEPA: board_layout + +**Seed 0.5702 -> Best 0.6084 (+0.0382)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo3_screen/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo3_screen/best_topology_skills.json new file mode 100644 index 0000000..5919b1f --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo3_screen/best_topology_skills.json @@ -0,0 +1,9 @@ +{ + "board_layout": "BOARD LAYOUT: lay the board top-to-bottom and state the direction; give it a title naming the subject AND its scope. Include EVERY element the rubric requires \u2014 keep each concise and sized so the whole board fits one screen without clipping (don't drop a required element to save space). Use real, specific values, never placeholders. For a multi-section board use panes: rows for a report read in order, a balanced 2x2 grid for a dashboard of peer tiles; title each pane.", + "screen_frame": "TASK: You produce a BOARD PLAN (a written spec) for a UI mockup that another tool will render visually. 
Read the inputs (Journey, Use case, Reader, Goal) and output a clean, structured plan that renders as a real, polished app screen \u2014 NOT a diagram.\n\nINPUT FORMAT: You receive Journey (the journey type), Use case, Reader, and Goal.\n\nTwo journey types:\n- \"<anything> (component)\" \u2192 ONE single mobile frame.\n- \"<anything> (panes)\" / \"screen-set\" \u2192 MULTIPLE peer mobile frames in one titled set.\nNOTE: Many use cases (design-system, wireframe, form-input kit, component-library showcase) arrive tagged \"(component)\". DO NOT silently convert a \"(component)\" job into a multi-frame set to dodge the budget \u2014 a single-frame request must render as ONE frame. Splitting it hurts comprehension. Only use multiple frames when the journey is explicitly a set/panes/flow.\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#0 PRIORITY \u2014 MAXIMIZE COMPREHENSION (the #1 scoring failure)\nComprehension has been the lowest-scoring axis (\u22480.2\u20130.3). The renderer and grader must be able to UNAMBIGUOUSLY map your words to concrete visual elements. To raise comprehension:\n- Be CONCRETE and LITERAL. Every element gets: a TYPE (card / input / button / chip / row / swatch / toggle), an exact POSITION (which section, what order), a SIZE/dimension hint, and literal TEXT/value it contains.\n- Avoid abstraction, meta-commentary, and rubric-justification asides inside the frame body (e.g. don't write \"the rubric demands 7 groups\"). Keep planning rationale only in the header/footer notes, never mixed into frame steps.\n- Use simple, declarative one-line element descriptions. 
Prefer \"Button, full-width, indigo fill, white bold text 'Continue'\" over prose paragraphs.\n- Give explicit X/Y or top\u2192bottom ordinal placement AND approximate pixel heights so the renderer can lay out without guessing (e.g. \"Header: 56px tall\", \"Card: ~120px\", \"CTA: 52px at y=780\").\n- State the frame's exact canvas size (390\u00d7844) and that all content sits inside it.\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#1 PRIORITY \u2014 RESPECT THE MOBILE BUDGET (prevents overflow/clipping)\nContent overflow remains a real failure even when you CLAIM \"no overflow\" \u2014 so you must actually CUT, not just assert. A real phone frame (~390\u00d7844) fits far LESS than you think. BE RUTHLESS \u2014 underfill rather than overfill. Per frame, the TOTAL of all content must stay within:\n- Status bar (~24px) + header (1 line title, ~56px) \u2014 counts toward budget.\n- THEN choose ONE of these content shapes, not several:\n \u2022 2 cards max (3 only if each is genuinely tiny, \u22642 inner lines), OR\n \u2022 2\u20133 list rows max, OR\n \u2022 1 short form (3 input fields max).\n- Plus ONE primary CTA (~52px).\n- VERIFY THE MATH: sum your block heights; they must total well under 844px (aim \u2264760px of content so there's breathing room). State this sum in your closing note.\nHARD CAPS: no more than ~6 total content blocks in a frame (lower than before \u2014 be conservative); no card with more than 4 inner lines; NEVER use multi-line \"Order summary\" / itemized receipts / long dotted-leader tables. If detail is implied (checkout, summary), SUMMARIZE on one row (e.g. 
\"2 items \u00b7 $164.93\").\nA bottom tab bar is optional; if used it counts as a block.\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#2 \u2014 REALISTIC, POLISHED APP LOOK (not a diagram)\n- Always include: a status bar (\"9:41\" + signal/wifi/battery), a top app bar/header with a real screen title, a primary content area, and ONE prominent CTA.\n- Use realistic placeholder content: real-sounding names, emails, prices, copy \u2014 never \"Lorem ipsum\" or \"Label 1\".\n- Consistent rounded styling: cards 12\u201316px radius, inputs 12px radius, buttons 10\u201312px radius; 16px side margins; 12px gaps between cards.\n- Light neutral background (#F5F6F8 or #F2F2F7), white cards, soft shadow, one accent color used sparingly.\n- Provide a font hierarchy note (header bold ~17pt, section labels small uppercase gray ~11pt, body ~14px).\n- EXCEPTION \u2014 wireframes: render greyscale low-fidelity (grey box fills #E4E4E7, 1px #A1A1AA outlines, placeholder bars \u25ac\u25ac\u25ac, \u2715-cross for images), with exactly ONE filled accent element (the CTA). Annotation pills with leader lines go OFF in the right margin, never crowding the frame.\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#3 \u2014 PRIMARY CTA MUST BE UNMISTAKABLE\n- Exactly ONE dominant CTA per frame: full-width, solid accent-color fill, bold white text, rounded, ~52px tall.\n- Make it the visually largest/boldest interactive element. 
Secondary actions must be plain text links, clearly subordinate (smaller, no fill).\n- Place it prominently (sticky bottom bar or near bottom of content).\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#4 \u2014 DESIGN-SYSTEM / COMPONENT-LIBRARY SPECIFICS (rubric: component_variants, spacing_or_grid)\nThese tasks keep losing points on component_variants (0.5) and spacing_or_grid (0.5). To fix:\n- COMPONENT_VARIANTS: For any component you document, show it in MULTIPLE DISTINCT STATES/VARIANTS explicitly and visibly \u2014 e.g. button: Default / Hover / Disabled / Outline; input: Empty / Filled / Error (red border + helper) / Success (green + \u2713); badge: Active/Pending/Error. Don't show one example of each component \u2014 show one component across several visibly different states. Label each variant with its name.\n- SPACING_OR_GRID: Include a concrete, VISIBLE spacing/grid spec \u2014 render actual scaled bar previews of spacing tokens (4 / 8 / 16 / 24 / 32px shown as bars of proportional width), state the base grid (e.g. \"8px base \u00b7 16px gutter \u00b7 4pt rhythm\"), and show alignment to a column grid. Make it visual, not just a text list.\n- If a single \"(component)\" frame can't fit both rich variants AND grid without clipping, prioritize a FOCUSED showcase (one component family in many variants) over cramming everything \u2014 and still keep within budget.\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#5 \u2014 FOR SCREEN-SETS (only when journey is a set/panes/flow)\n- Give the whole set a clear TITLE BANNER at top (e.g. 
\"Sign-Up Flow \u2014 Acme App \u00b7 3 Steps\") and a caption above each frame (\"Step 1 of 3 \u00b7 Create Account\").\n- Lay frames LEFT-TO-RIGHT as peer tiles, connected by labeled arrows showing transition direction.\n- Each frame must be DISTINCT and individually realistic \u2014 different content/state per step (each obeying the mobile budget), not near-duplicates. Show state progression (progress dots \u25cf\u25cb\u25cb \u2192 \u25cf\u25cf\u25cb \u2192 \u25cf\u25cf\u25cf, success-green CTA on a final/done screen).\n- Keep shared chrome consistent across frames (same phone outline, status bar, header style, input style, accent color).\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\nOUTPUT FORMAT\n- Start with \"# BOARD PLAN: <title>\".\n- State journey type, orientation (single frame top-to-bottom, or set left-to-right), canvas size, and global styling up front.\n- Number the sections of each frame top\u2192bottom, each with TYPE + position + height + literal content.\n- Keep all rationale/justification out of the numbered frame steps; put it in the header note and the closing note.\n- End with: (a) a short hierarchy/consistency note, and (b) an explicit block-height sum confirming content fits (\"blocks sum to ~Npx of 844 \u2014 one screen, no overflow\").\n\nREMEMBER: A sparse, clean, LITERAL frame that a renderer can unambiguously interpret scores far higher than a dense, abstract, or accurate-but-clipped one. 
When in doubt, cut content and be more concrete.", + "artifact_note": "Emit every required item as its OWN labeled, discrete element (own header / structured field / inline spec), never as prose \u2014 the grader scores the rendered image.", + "tail_app-screen-mockup": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'app-screen-mockup'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Looks like a real polished app screen, not a diagram\n- A header / top app bar / nav is present\n- A clear primary content area (cards/list/feed/form) fills the screen\n- A prominent primary call-to-action button/action is present\n- Clear visual hierarchy via headings, sections and spacing\n- Realistic placeholder content (names, values, labels), not lorem stubs\n- Consistent polished styling (colors, spacing, rounded cards)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_wireframe": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'wireframe'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Low-fidelity look: greyscale boxes/placeholders, minimal color or imagery\n- Layout regions are blocked out (header, sidebar, content, footer)\n- Placeholder text/image blocks stand in for real content\n- Annotations/notes explain regions or interactions\n- Elements align to a clear grid/structure\n- Key UI elements (nav, buttons, inputs, lists) are indicated as boxes\n- Titled with the screen/page name\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_design-system": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'design-system'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A color palette with swatches (primary/secondary/semantic) is shown\n- A typography scale (headings/body) is shown\n- Core components shown in their variants (e.g. button states, badges)\n- Form elements (inputs, toggles, selects) are displayed\n- Spacing/grid or sizing tokens are documented\n- Each element/section is labeled with its name/usage\n- A consistent visual brand across the showcase\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_screen-set": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'screen-set'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Several distinct app screens are shown side by side\n- The screens tell a sequence/flow (step 1 \u2192 2 \u2192 3)\n- Each screen/frame has a title/caption\n- Each frame looks like a real screen, not a diagram\n- Consistent visual style across the frames\n- The flow/transition between frames is indicated\n- The set has an overall title naming the flow\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo3_screen/report.md b/scripts/experiments/gepa-flowchart/overnight/topo3_screen/report.md new file mode 100644 index 0000000..29ef9a2 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo3_screen/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: app-screen-mockup, wireframe, design-system, screen-set +shared skills: artifact_note, board_layout, screen_frame +shared skills CHANGED by GEPA: screen_frame + +**Seed 0.5816 -> Best 0.7175 (+0.1359)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo3b_graph/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo3b_graph/best_topology_skills.json new file mode 100644 index 0000000..a5c13bd --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo3b_graph/best_topology_skills.json @@ -0,0 +1,34 @@ +{ + "graph_entity_lanes": "GRAPH LAYOUT (entity/relationship): each entity is a type:\"entity\" node with its attributes in data.fields[] (one {name,type,key?} per row, mark PK/FK) and an explicit width/height \u2014 NEVER cram fields into data.label (that makes one giant overlapping box). Arrange entities in columns; draw each relationship as a labeled edge with cardinality (1, 0..1, 1..*) and an arrowhead. Every entity participates in at least one relationship.", + "graph_process_spine": "GRAPH LAYOUT (process flow): keep the happy path on one straight top-to-bottom spine; send branch/exception steps to the side with edges routed AROUND \u2014 never through \u2014 unrelated nodes. ~5-10 nodes. Label every edge with its trigger/condition and give it a visible arrowhead; color nodes by role via type:\"change\"+status and add a legend.", + "board_layout": "BOARD LAYOUT: lay the board top-to-bottom and state the direction; give it a title naming the subject AND its scope. 
Include EVERY element the rubric requires \u2014 keep each concise and sized so the whole board fits one screen without clipping (don't drop a required element to save space). Use real, specific values, never placeholders. For a multi-section board use panes: rows for a report read in order, a balanced 2x2 grid for a dashboard of peer tiles; title each pane.", + "graph_zoned_tiers": "GRAPH LAYOUT (zoned architecture): group nodes into labeled zones/tiers via groups[]; show the external entry point and the request/data flow as directed edges between components; distinguish managed services from compute; add a legend. Keep zones from overlapping and route edges around unrelated nodes.", + "artifact_note": "Emit every required item as its OWN labeled, discrete element (own header / structured field / inline spec), never as prose \u2014 the grader scores the rendered image.", + "tail_architecture-zones": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'architecture-zones'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Nodes are clustered into clearly labeled horizontal zone bands (e.g. Edge / Frontend / Services / Data) stacked top-to-bottom, not a flat scatter\n- The overall flow direction reads top-to-bottom, with edges connecting only adjacent tiers (no edge visibly skips over a middle band)\n- Each zone band has its own background tint/color distinguishing it from neighboring bands\n- Nodes are color-coded by role/status (not all plain grey) with each node showing a label plus a secondary sub line\n- Edge arrows are colored (not all grey) and visually distinguishable by purpose\n- A legend/key panel maps colors to their meaning\n- No node boxes overlap and no edge line passes through an unrelated node box\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_swimlane-process": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'swimlane-process'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The diagram is divided into parallel full-length lanes, each labeled with one actor/role\n- Each process step sits inside the lane of the actor who performs it\n- Cross-lane edges visibly hop between lanes at each hand-off between actors\n- The main happy path is colored (e.g. green) and visually distinct from any error/retry path\n- A failure or retry path is shown as a dashed/red edge distinct from the happy path\n- A legend explains the path colors (happy vs error/async)\n- Each step node carries a short label plus a sub/meta detail, not a bare single word\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_user-journey": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'user-journey'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Nodes progress through semantic tones: a blue/info entry, amber/active steps, and a green/done success outcome\n- At least one error/failure branch is shown with a red node and a dashed edge\n- The journey reads top-to-bottom (TB direction)\n- Steps carry inline glyph icons appropriate to the action (not bare text)\n- Each step shows a sub line (e.g. screen, endpoint, or note) under its label\n- A legend maps the tone colors to journey states (entry/step/success/failure)\n- Each edge color matches the tone of the node it points to\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_state-machine": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'state-machine'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each distinct state is a clearly labeled node\n- Every transition edge carries a label naming the triggering event/action\n- Every transition edge has a visible arrowhead showing direction\n- Terminal/final state(s) are visually distinguished (e.g. green/done tone)\n- An initial/entry state is identifiable as the starting point\n- The diagram is laid out top-to-bottom\n- No transition edge passes through an unrelated state node\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_er-diagram": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'er-diagram'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each table/entity is a titled box listing its fields as rows\n- Fields show their data types and key fields (PK/FK) are marked\n- Edges between entities are labeled with cardinality (e.g. 1..*, places, has)\n- Relationship edges have visible arrowheads/markers\n- The schema is laid out top-to-bottom\n- Entity boxes do not overlap and edges do not cross through unrelated entity boxes\n- Every entity participates in at least one labeled relationship (no orphan tables)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_class-diagram": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'class-diagram'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each class is a titled box showing attribute rows and method() rows\n- Method members are visibly distinguished from attributes (e.g. trailing parentheses)\n- Inheritance is drawn with parents above children (BT orientation)\n- Edges show inheritance/association between classes with arrowheads\n- Class boxes do not overlap and edges do not cross through unrelated boxes\n- The parent-child hierarchy is readable at a glance (a clear tree, not a tangle)\n- Every class connects to at least one other (no orphan class)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_event-driven": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'event-driven'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Producers fan in to a central broker/topic node which then fans out to consumers\n- A clearly identified broker/topic/queue node sits between producers and consumers\n- Stream/queue edges are visually marked as flowing (animated dash) rather than static plain lines\n- A dead-letter/error path is shown with a dashed red edge to a DLQ node\n- The topology is laid out top-to-bottom\n- Producer, broker, and consumer nodes are color-coded by role (not all grey)\n- No node boxes overlap and edges do not pass through unrelated nodes\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_query-plan": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'query-plan'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The plan is a top-down tree of SQL operator nodes (scans, joins, aggregates)\n- Operators are colored by cost: cheap=green, moderate=amber, hot=red\n- The most expensive operator (hot scan/join) is flagged with an icon or red highlight\n- Each operator shows rows and/or cost figures in a meta/sub line\n- Operators show the relevant predicate/join key/grouping detail\n- The operator tree reads top-to-bottom\n- A legend maps the cost colors to their meaning\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_data-lineage": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'data-lineage'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Nodes are grouped into labeled tier zones (Sources / Staging / Marts / Consumers)\n- Data flows in one consistent direction from sources toward consumers\n- Datasets show a freshness/SLA indicator in a meta line\n- At least one stale-past-SLA edge is shown dashed/red distinct from fresh green edges\n- Fresh data-flow edges are colored green\n- A legend distinguishes fresh vs stale edges\n- No nodes overlap and edges do not pass through unrelated nodes\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_recursion-tree": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'recursion-tree'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Recursive calls form a top-down tree, each call a node\n- Base-case leaf nodes are colored green/done\n- Each call node shows its return value in a sub line\n- Memoized/cache-hit calls are visually distinct (e.g. blue/info) with a 'cached' edge label, if applicable\n- The tree reads top-to-bottom from the root call\n- No call nodes overlap and edges do not pass through unrelated nodes\n- A legend maps node colors to base case / internal call / memoized\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_critical-path": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'critical-path'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Tasks are laid out left-to-right reflecting the time/schedule axis\n- The critical path tasks are highlighted red with thick edges and a 'bolt'-style icon (not read as 'failed')\n- Tasks with slack are shown neutral/grey with dashed edges, distinct from the critical chain\n- Each task shows its duration (e.g. '6d') in a meta line\n- Each task shows its owner (and optionally slack) in a sub line\n- A two-row legend distinguishes critical path (red) from has-slack (grey-dash)\n- No task nodes overlap and edges do not pass through unrelated nodes\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_pr-review": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'pr-review'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Code-entity nodes are colored by diff status: added=green, removed=red(strikethrough), modified=amber, unchanged=neutral\n- Each node carries a kind eyebrow tag (module/function/class/file)\n- Each node shows its file path in a sub line\n- Changed nodes show a LOC delta (e.g. +42) in a meta line\n- Edges to removed entities are dashed/red, distinct from other edges\n- A legend maps the diff-status colors to added/removed/modified/unchanged\n- The change graph is laid out top-to-bottom\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_okr-tree": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'okr-tree'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The tree shows three levels: Objective at top, Key Results below, Initiatives at the bottom\n- Nodes carry kind tags (objective / key result / initiative)\n- Objective and key-result nodes show a percent/progress figure in a meta line\n- Nodes are colored by status: on-track=green, at-risk=amber, behind=red\n- Edges are colored to match their target node's status\n- A legend maps the colors to on-track / at-risk / behind\n- The tree is laid out top-to-bottom from objective to initiatives\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_build-pipeline": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'build-pipeline'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each pipeline stage (build, test, scan, deploy) is a node\n- Stages are colored by status: passed=green, running=amber, failed=red\n- Stages connect in execution order with arrowheads\n- Stages show a duration or step detail in a meta/sub line\n- If a stage fails, it is clearly red and the downstream stages reflect being blocked/skipped\n- A legend maps status colors to passed/running/failed\n- No stage nodes overlap and edges do not pass through unrelated nodes\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_k8s-topology": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'k8s-topology'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Resources are grouped into labeled namespace/cluster zones\n- Distinct resource kinds (Ingress, Service, Deployment, Pod, ConfigMap, Secret, Node) are shown as labeled nodes\n- The ingress\u2192service\u2192deployment\u2192pods traffic path is traceable via edges\n- Edges are colored by purpose (traffic / manages / config-mount / scheduled-on) and distinguishable\n- Pods show a status (e.g. Running) and are color-coded\n- A legend maps edge colors to their purpose\n- No resource nodes overlap and edges do not pass through unrelated nodes\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_security-threat-model": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'security-threat-model'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- An attacker goal / protected asset is the root node\n- Attack vectors (kind=vector) branch from the asset with descriptive subs\n- Each vector connects to one or more mitigation nodes (kind=mitigation)\n- Nodes/edges are colored by mitigation state: mitigated=green, partial=amber, open=red\n- An unmitigated gap is shown with a dashed red edge or an explicit 'gap' label\n- A legend maps colors to mitigated / partial / open\n- The threat tree is laid out top-to-bottom\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_sequence-diagram": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'sequence-diagram'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each participant has a labeled header box with a vertical lifeline beneath it\n- Messages are drawn as horizontal arrows between lifelines in vertical time order (top = earliest)\n- Request/call arrows are visually distinct from response/return arrows\n- Every message arrow is labeled with the call/response it represents\n- Every declared participant sends or receives at least one message\n- Lifelines are evenly spaced and vertical, with no overlapping arrows\n- Each arrow clearly points from sender to receiver\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_call-hierarchy": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'call-hierarchy'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A root function is at the top of the hierarchy\n- Called functions are nested as children under their callers\n- Each function node shows its file/location\n- If a before/after diff, nodes are colored by change (added/removed/modified/unchanged)\n- Changed nodes carry a note explaining the change\n- Relevant nodes show meta info (e.g. timing or call count)\n- The hierarchy reads cleanly as a tree (clear parent-child nesting, no tangle)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_c4-architecture": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'c4-architecture'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Three flow panes are stacked top-to-bottom as increasing zoom levels\n- The top pane is a System Context view (the system as a box with users and external systems)\n- The middle pane is a Container view (apps/services/data stores inside the system)\n- The bottom pane is a Component view (building blocks inside one container)\n- Edges are labeled with the interaction and protocol\n- Each level distinguishes person / this-system / external (color or legend)\n- Each level is a small, clean diagram (roughly 7 nodes or fewer)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_ml-pipeline": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'ml-pipeline'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each pipeline stage is a clearly labeled node (ingest, features, train, eval, deploy)\n- Stages are connected in execution order with directed arrows\n- An eval/validation gate before deployment is explicitly shown\n- Data/model artifacts passed between stages are labeled\n- A branch on eval pass/fail (deploy vs retrain) is shown\n- The pipeline is laid out top-to-bottom\n- No edge passes through an unrelated stage node\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_terraform-resource-graph": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'terraform-resource-graph'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each resource/data source is a labeled node showing its type\n- Directed edges show resource dependencies (what depends on what)\n- The provider/root config is identifiable as a source node\n- The graph implies a clear creation order (sources before dependents)\n- Resources are visually distinguished by type or tone\n- The graph is laid out top-to-bottom\n- No edge passes through an unrelated resource node\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_cloud-architecture": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'cloud-architecture'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Resources are grouped into labeled zones (VPC, public/private subnets, tiers)\n- Each resource is a labeled node showing its service/type\n- Directed edges show the request/traffic flow between components\n- The internet/user entry point and the cloud boundary are shown\n- Managed services (DB, queue, cache) are distinguished from compute\n- A legend or tones explain the zone/resource types\n- Nodes and zones do not overlap; edges avoid unrelated nodes\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_network-topology": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'network-topology'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- VPC(s) and their public/private subnets are shown as zones\n- Internet/NAT gateways and their placement are shown\n- Routing between subnets/gateways is indicated by edges or labels\n- Security groups / firewall rules are represented\n- Subnets/VPCs are labeled with their CIDR ranges\n- Edges show the allowed traffic direction\n- Zones and nodes do not overlap; edges avoid unrelated nodes\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_terraform-module-tree": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'terraform-module-tree'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The root module is the top of the hierarchy\n- Child modules are nested under their calling module\n- Each module lists the resources it creates\n- Module inputs/outputs (or key variables) are indicated\n- Reused/shared modules are identifiable\n- The composition hierarchy reads top-down clearly\n- Title names the root configuration\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_user-flow": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'user-flow'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each node represents a screen/state of the app\n- Edges are labeled with the user action/trigger causing the transition\n- A clear entry/start screen is identifiable\n- Decision points (e.g. logged-in vs not, success vs error) branch\n- A primary happy-path is distinguishable\n- End/goal states are marked\n- Laid out top-to-bottom\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_sitemap": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'sitemap'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The home/root screen is the top of the hierarchy\n- Pages/sections nest under their parent\n- Navigation groups/sections are clear\n- Multiple depth levels are shown (primary nav \u2192 sub-pages)\n- Each screen/page is clearly labeled\n- Laid out top-to-bottom as a tree\n- No edge passes through an unrelated page node\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_generic-flow": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'generic-flow'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A consistent layout direction (top-down or left-right) suited to the content\n- Every node has a concise, meaningful label\n- Edges are labeled wherever the relationship is not obvious\n- Nodes are color-coded by role/type, with a legend when colored\n- Appropriate level of detail \u2014 not too sparse, not overcrowded\n- No node overlaps and no edge crosses through an unrelated node\n- The diagram fully covers the requested topic with no obvious gaps\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo3b_graph/report.md b/scripts/experiments/gepa-flowchart/overnight/topo3b_graph/report.md new file mode 100644 index 0000000..5e98344 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo3b_graph/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: architecture-zones, swimlane-process, user-journey, state-machine, er-diagram, class-diagram, event-driven, query-plan, data-lineage, recursion-tree, critical-path, pr-review, okr-tree, build-pipeline, k8s-topology, security-threat-model, sequence-diagram, call-hierarchy, c4-architecture, ml-pipeline, terraform-resource-graph, cloud-architecture, network-topology, terraform-module-tree, user-flow, sitemap, generic-flow +shared skills: artifact_note, board_layout, graph_entity_lanes, graph_process_spine, graph_zoned_tiers +shared skills CHANGED by GEPA: (none) + +**Seed 0.4995 -> Best 0.4995 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo_chart/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo_chart/best_topology_skills.json new file mode 100644 index 0000000..fcb2da5 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo_chart/best_topology_skills.json @@ -0,0 +1,9 @@ +{ + "chart_internal": "CHART: emit a COMPLETE inline Vega-Lite spec (never a description). Title + subtitle; axis titles WITH units; set width to \"container\". Sort categories explicitly (usually descending). Put value labels on marks and a legend when there are multiple series; make the key insight obvious.", + "board_layout": "BOARD LAYOUT: lay the board top-to-bottom and state the direction; give it a title naming the subject AND its scope. Include EVERY element the rubric requires \u2014 keep each concise and sized so the whole board fits one screen without clipping (don't drop a required element to save space). Use real, specific values, never placeholders. 
For a multi-section board use panes: rows for a report read in order, a balanced 2x2 grid for a dashboard of peer tiles; title each pane.", + "artifact_note": "Emit every required item as its OWN labeled, discrete element (own header / structured field / inline spec), never as prose \u2014 the grader scores the rendered image.", + "tail_training-curves": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'training-curves'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Separate train and validation curves are both plotted\n- X axis is the training step/epoch with a clear axis title\n- Y axis is the loss/metric with a clear axis title\n- The train/val divergence (overfitting gap) is visible and legible\n- The best checkpoint / early-stop point is marked (rule or point)\n- A legend distinguishes the train vs validation series\n- Chart has a descriptive title and subtitle naming the run\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_confusion-matrix": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'confusion-matrix'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- An N\u00d7N grid of actual (rows) vs predicted (columns) classes\n- Each cell shows its count or rate as a text label\n- Cells are color-encoded by value with a sequential scale\n- Both axes are labeled (actual vs predicted) with the class names\n- The correct-prediction diagonal is distinguishable from off-diagonal errors\n- A color legend/scale is present\n- Title names the classifier and dataset\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_ablation-study": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'ablation-study'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- One bar per ablated component/variant\n- The full/baseline model is shown as a reference bar or rule\n- The drop/delta from removing each component is clear\n- Y axis is the eval metric with a clear axis title\n- Variants are ordered by impact on the metric\n- Bars are labeled with their metric values\n- Title names the model and the metric being ablated\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_infra-cost-breakdown": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'infra-cost-breakdown'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A bar (or stacked bar) chart of cost by service/resource category\n- The cost axis has a title and currency units\n- Each category/service is clearly labeled\n- Categories are sorted by cost (largest first)\n- A total and/or per-bar value labels are shown\n- The top cost driver is visually highlighted\n- Title names the account/environment and the period\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo_chart/report.md b/scripts/experiments/gepa-flowchart/overnight/topo_chart/report.md new file mode 100644 index 0000000..ff2c8be --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo_chart/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: training-curves, confusion-matrix, ablation-study, infra-cost-breakdown +shared skills: artifact_note, board_layout, chart_internal +shared skills CHANGED by GEPA: (none) + +**Seed 0.4221 -> Best 0.4481 (+0.0261)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo_comparison/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo_comparison/best_topology_skills.json new file mode 100644 index 0000000..841218c --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo_comparison/best_topology_skills.json @@ -0,0 +1,9 @@ +{ + "comparison_grid": "COMPARISON LAYOUT: equal-width option cards side by side with IDENTICAL aligned rows; add a delta column (arrow + sign + value + %) where it applies; flag the best/recommended option with a badge (tone grey\u2192green). For products, show an image and an outbound link per option. Limit to the 3-4 most relevant options and keep each card compact (small image, ~4-5 spec rows) so the row fits on one screen without horizontal overflow.", + "board_layout": "TASK: You are designing a single-screen \"board plan\" \u2014 a text spec for a data-visualization board that another system will render.\n\nINPUT FORMAT: You receive four fields:\n- Journey: the board archetype (e.g., before-after, model-leaderboard, shopping-comparison, product-comparison). 
Suffix \"(component)\" indicates a single focused component.\n- Use case: the specific content/metrics to display.\n- Reader: the target audience.\n- Goal: the decision or insight the reader should walk away with.\n\nOUTPUT: A structured markdown board plan optimized for FOUR scored dimensions:\n- comprehension \u2014 clarity of THE insight at a glance. THIS IS ALWAYS THE WEAKEST DIMENSION (historically ~0.45\u20130.50) AND THE TOP PRIORITY. Most failures come from boards that are data-complete but make the reader work to extract the verdict. Design so a glance answers the Goal.\n- visual_quality \u2014 historically maxes at 1.00 with the formatting below; keep it.\n- geometry \u2014 clean fit on ONE screen with NO clipping. Dense wide tables overflow; you MUST control width/row count.\n- rubric \u2014 every required element present and correct; historically 1.00. Keep it.\n\n====================================================================\nPRIORITY #1 \u2014 COMPREHENSION (spend the most effort here)\n====================================================================\nThe reader must learn the answer to the Goal in under 3 seconds, BEFORE reading any table. Do all of the following:\n\n1. VERDICT BANNER (mandatory, immediately under the title): One bold, plain-language sentence that directly answers the Goal naming the winner/result and the SINGLE most decision-relevant reason. Examples:\n - \"\u2192 Pick DigitalOcean: same 2 vCPU/8 GB specs as AWS for $12/mo less (\u221220%).\"\n - \"\u2192 Optimization cut bundle 58% and load time 1.9 s \u2014 ship it.\"\n - \"\u2192 Canary-1B: most accurate (4.8% WER) AND real-time fast (RTF 0.08).\"\n Make this the visually dominant element after the title \u2014 not a buried footer.\n\n2. ONE HERO METRIC: Identify the single number that decides the Goal and make it the largest, most prominent value on the board. Everything else is supporting context.\n\n3. 
PLAIN-LANGUAGE LABELS: Annotate what \"good\" looks like inline (e.g., \"lower = better\", \"best\", \"cheapest\", \"winner\") so the reader never has to infer direction or compute comparisons themselves.\n\n4. PRE-COMPUTE THE COMPARISON: Never make the reader do mental math. Show deltas, percentages, \"X\u00d7 slower/faster\", \"$Y cheaper than average\" directly next to values. The insight should be readable, not derivable.\n\n5. SUPPRESS NOISE: Cut any metric/column/row that does not help answer the Goal. Fewer, decision-relevant numbers beat exhaustive ones for comprehension.\n\n====================================================================\nPRIORITY #2 \u2014 GEOMETRY (no clipping on one screen)\n====================================================================\n- Hard caps to avoid overflow: tables max ~6\u20137 columns AND ~6 rows. If a leaderboard has more metrics, drop secondary columns or fold them into a compact \"Score/Notes\" cell. Wide many-column tables (e.g., 9 columns) DO clip \u2014 split, trim, or summarize instead.\n- Prefer fewer, denser rows over many; keep every cell terse (a value + a short flag, not a sentence).\n- Keep the whole board top-to-bottom and shallow; do not stack many sections.\n\n====================================================================\nCORE RULES\n====================================================================\n1. LAYOUT: State orientation explicitly (\"Top-to-bottom flow\", \"read left\u2192right\"). Lay the board top-to-bottom. Give a TITLE naming both the SUBJECT and its SCOPE with real qualifying details: version, dataset, hardware, date, test conditions (e.g. \"COCO val2017 mAP vs Speed (V100, 640\u00d7640, Jan 2025)\", \"load test, 500 RPS sustained\", \"Lighthouse 11 mobile, Slow 4G, Mar 2025\").\n2. COMPLETENESS: Include EVERY element the rubric requires; never drop one. Keep each concise so the whole board fits one screen.\n3. 
REAL VALUES ONLY: Specific, plausible, internally-consistent numbers \u2014 never placeholders. Verify all derived values (deltas, %, \"best/lowest\" flags) arithmetically; a wrong \"best\" flag costs rubric points. Use real brand/product/model names.\n4. STRUCTURE BY TYPE:\n - Comparison/leaderboard/shopping = equal-width option cards side by side, IDENTICAL aligned rows in the same order so values compare horizontally.\n - Report = stacked rows read in order.\n - Dashboard = balanced 2\u00d72 grid of peer tiles.\n - Title every pane/section.\n\n====================================================================\nARCHETYPE-SPECIFIC GUIDANCE\n====================================================================\nbefore-after:\n- Two equal-width cards: LEFT=\"BEFORE\" (grey header + baseline/version subtitle); RIGHT=\"AFTER\" (green header + version subtitle).\n- Same metrics in same order on both for row alignment.\n- AFTER card gets a \u0394 column: arrow + sign + absolute value + percent.\n- Use \u25bc where lower=better (latency, error rate, memory, bundle size, requests, load time); \u25b2 where higher=better (throughput, Lighthouse/score). GREEN for improvement, RED for regression.\n- Headline improvement badge near the title (grey baseline \u2192 green) summarizing the biggest wins.\n- Add a VERDICT BANNER answering the Goal in plain words.\n- Footer: \"Deltas = After \u2212 Before; % relative to Before; same hardware/network; median of N runs.\"\n\nmodel-leaderboard:\n- Each model = one clearly separated ROW (common rubric weakness \u2014 keep model rows complete and distinct). Metrics = aligned columns.\n- Rank rows best\u2192worst by the primary balance metric.\n- KEEP COLUMNS \u2264 ~6 to avoid clipping: typically Rank, Model, primary accuracy metric, speed metric, Params/FLOPs (pick the more relevant), Score. Fold extra dataset columns into an averaged metric or notes if needed.\n- Flag per-metric best cell with \ud83c\udfc6 (grey\u2192green). 
Put \ud83c\udfc5 WINNER badge (green) on the best card with a one-line justification.\n- Show delta-vs-winner notes (e.g., \"+0.4pp WER, 2\u00d7 faster\") so trade-offs are obvious.\n- Footer: define any Score formula + sources/links + measurement conditions.\n\nshopping-comparison / product-comparison:\n- 3+ equal-width cards, identical row order: logo/image tile \u2192 product/plan name (+ badge) \u2192 prominent LARGE price \u2192 4\u20135 aligned spec rows \u2192 star rating + review count \u2192 outbound link button with real domain.\n- Flag the best with a green \"BEST VALUE\" badge AND show its price delta vs peer average (\u25bc \u2212$X / \u2212Y%).\n- VERDICT BANNER must state which to choose and why (e.g., \"same specs, $X cheaper\").\n\nFINISH WITH: A brief rubric-satisfaction checklist confirming each required element is present, including the verbatim arithmetic check of every delta/percentage.", + "artifact_note": "TASK\nYou generate a single UI component as a JSON tree (Mantine-style components) that renders into an image. A grader scores the RENDERED IMAGE on four axes: comprehension, visual_quality, geometry, and rubric. Your job is to maximize all four.\n\nINPUT FORMAT\nYou receive a short spec:\n- Journey: the component archetype (e.g. shopping-comparison, model-leaderboard)\n- Use case: the specific content and which items/columns must appear, including what to flag (cheapest, winner, best-per-metric, etc.)\n- Reader: the target audience\n- Goal: the decision the reader needs to make\n\nOUTPUT FORMAT\nEmit ONLY a valid JSON object describing the component tree. Every node has \"type\", \"props\", and (optionally) \"children\". Use real Mantine components: Stack, Group, SimpleGrid, Card, Title, Text, Badge, Alert, Table, List/List.Item, Image, Anchor, Divider, Checklist. 
children is either a string or an array of nodes.\n\nCORE PRINCIPLE\nEmit every required item as its OWN labeled, discrete element (own header / structured field / table cell / badge / inline spec) \u2014 never bury required facts in prose paragraphs. The grader scores what is visibly, distinctly rendered.\n\nHARD-WON LESSONS FROM PAST RUNS (apply all):\n\n1. GEOMETRY IS THE #1 LOSS. Every past board lost points to \"content overflows/clips.\" You MUST budget space aggressively:\n - Do NOT pack a 3-column SimpleGrid of detail cards AND a full comparison table AND a hero card AND an alert AND a guidance list all at once. Pick ONE primary presentation of the per-item data (either per-item cards OR one comparison table \u2014 not both) to avoid overflow.\n - Keep titles SHORT. Move conditions/assumptions (date, units, methodology) into a small dimmed footnote Text, not the Title. Long titles wrap and clip.\n - Inside SimpleGrid cards keep content compact: few short lines, avoid nested Tables inside narrow grid columns (they overflow horizontally).\n - Prefer one wide Table over many narrow cards when there are \u22653 items with \u22654 attributes each \u2014 tables fit the most data without clipping.\n - Drop decorative/redundant blocks (extra hero cards, duplicate winner cards, Checklists) that consume vertical space without adding required content.\n\n2. COMPREHENSION CAN TANK EVEN WITH A CLEAN RUBRIC (one board hit comp 0.50). To keep comprehension high:\n - Lead with a single clear verdict/recommendation tied directly to the Goal, in an Alert with a \"\u2192\" directive.\n - Rank items explicitly by the primary decision metric and show that ordering visibly.\n - Make the flagged/winning item unambiguous and consistent across the whole layout (don't flag two different \"best\" items, and don't let hero/alert/cards disagree on the winner).\n - State deltas in plain terms (e.g. \"\u2212$20 / \u221214% vs avg\") next to the relevant value.\n\n3. 
RUBRIC SPECIFICS BY ARCHETYPE:\n - shopping-comparison: rubric repeatedly lost on product_image 0.5. Placeholder images (placehold.co) score poorly. Provide a REAL, plausible product image src when possible, with a descriptive alt. Each product must show: name, price, weight, cushioning, drop, rating(+count), a buy link, and the cheapest clearly flagged. Show average price and per-item delta vs average.\n - model-leaderboard: each model = its own table row; eval metrics = columns; flag best-per-metric (\ud83c\udfc6); include params + latency/RTF per model; rank by the primary metric; declare an overall winner; include an aggregate score column; define metrics/scoring in a footnote. (This rubric already scored 1.00 \u2014 keep this structure but trim blocks to fix geometry.)\n\n4. GENERAL STRATEGY:\n - Structure: short Title \u2192 verdict Alert \u2192 ONE primary data presentation (table or compact cards) \u2192 brief plain-language guidance List \u2192 dimmed footnote with units/sources/methodology.\n - Every required attribute from the Use case must appear as a discrete labeled field/cell, with units.\n - Flag superlatives (cheapest, lightest, highest-rated, fastest, winner) inline with badges or \u2713/\u25bc markers.\n - Keep total element count modest to guarantee everything fits without clipping \u2014 completeness that overflows scores worse than a slightly leaner layout that renders fully.\n\nOutput the JSON only, no commentary.", + "tail_shopping-comparison": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'shopping-comparison'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each shopping option is its own card in a comparison grid\n- Each option shows a prominent product image\n- Each option shows its price\n- The cheapest (or best-value) option is flagged with a badge\n- Each option lists key specs/features for comparison\n- Each option has an outbound buy/visit link\n- Each 
option shows a rating or review score\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_product-comparison": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'product-comparison'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are planning a visual comparison board made of side-by-side option cards in a grid. Output a concise plain-text plan naming the SPECIFIC content (real product names, real prices, real ratings, real features, real outbound URLs) that satisfies each rubric criterion. Generic plans fail \u2014 use concrete, realistic values.\n\nRUBRIC \u2014 satisfy EVERY criterion, explicitly, for EVERY option:\n- Each option is its own card in a grid\n- Each option shows a PROMINENT product image/logo \u2014 describe it as a large, full-card-width image tile (an actual product photo or brand logo). Do NOT describe it as a single letter, initial, monogram, placeholder, or small icon. This is the most-failed criterion: make every image clearly a real, large product photo (e.g. \"Large product photo of the MacBook Air (Midnight, lid open), full card-width hero tile at top of card\").\n- Each option shows its price (concrete currency value)\n- Each option shows a star rating or score badge (e.g. \u2605\u2605\u2605\u2605\u2606 4.4/5 with review count)\n- Each option lists its key features (use IDENTICAL, aligned feature rows across all cards in the same order so values compare horizontally)\n- Each option has a clickable outbound 'Visit site \u2192' link with a real domain\n- A standout/recommended option is flagged with a badge (e.g. 'Popular', 'Best Value')\n\nLAYOUT RULES (critical \u2014 geometry is scored, and content overflow/clipping loses points):\n- Lay out options as equal-width cards in a single row grid, read left\u2192right.\n- KEEP CONTENT WITHIN CARD BOUNDS. 
Avoid overflow/clipping: limit each card to ~4\u20135 short feature rows, keep labels terse, and do not pack extra annotations (delta vs average, percentage comparisons, verification notes, long subtitles) that bloat the card and cause clipping. Prefer brevity over completeness.\n- Within each card, order top-to-bottom: large image tile \u2192 product name (+ badge) \u2192 price \u2192 rating \u2192 key feature rows \u2192 'Visit site \u2192' button.\n- Rank/order cards meaningfully (e.g. by value or recommendation), and flag exactly one primary recommended card with a prominent badge.\n\nOUTPUT STRUCTURE:\n1. TITLE \u2014 a clear comparison title with scope (region/currency, date/config basis) kept short.\n2. LAYOUT \u2014 orientation, number of cards, shared row order.\n3. One block PER CARD with: image (described as large real photo), name, badge if any, price, rating, key features (aligned rows), 'Visit site \u2192' link.\n4. A short rubric checklist confirming each criterion is met.\n\nKeep the whole plan concise. Concrete real values beat exhaustive detail. Do not let any card's content exceed what fits cleanly in a grid cell.", + "tail_model-leaderboard": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'model-leaderboard'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each model is its own row in a comparison table\n- Multiple eval metrics are shown as columns\n- The best value per metric (or the winning model) is flagged with a badge\n- Model size/params or cost/latency is shown for each model\n- Models are ranked/sorted by a primary metric\n- An overall winner or aggregate score is indicated\n- Title states the benchmark/dataset the models were evaluated on\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_before-after": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'before-after'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A two-column layout shows a Before card and an After card side by side\n- Each card contains a table of the same metrics for comparison\n- The After card includes a delta (\u0394) column showing the change\n- A headline badge shifts color (e.g. grey \u2192 green) to signal improvement\n- The cards are clearly labeled 'Before' and 'After'\n- Metric rows align across both cards so values are directly comparable\n- The direction of improvement is visually clear (color or sign on deltas)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo_comparison/report.md b/scripts/experiments/gepa-flowchart/overnight/topo_comparison/report.md new file mode 100644 index 0000000..ff53cce --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo_comparison/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: shopping-comparison, product-comparison, model-leaderboard, before-after +shared skills: artifact_note, board_layout, comparison_grid +shared skills CHANGED by GEPA: artifact_note, board_layout + +**Seed 0.7385 -> Best 0.8986 (+0.1602)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo_dashboard/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo_dashboard/best_topology_skills.json new file mode 100644 index 0000000..a7ed15a --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo_dashboard/best_topology_skills.json @@ -0,0 +1,9 @@ +{ + "dashboard_grid": "DASHBOARD LAYOUT: a balanced grid of peer tiles \u2014 include every panel 
the rubric names. Use stat cards (dim label / large value) with a polarity badge tied to a named threshold; set embedded charts' width to \"container\". Keep each tile concise; never drop a required panel to save space.", + "board_layout": "You are designing a BOARD PLAN: a layout specification for a dashboard or report visualization. \n\nINPUT FORMAT: You receive a Journey (board type + form factor: \"component\" = single-focus board, \"panes\" = multi-section board), a Use case (what the board shows), a Reader (the audience), and a Goal (what they need to accomplish).\n\nYOUR #1 PRIORITY IS READER COMPREHENSION. The board must be instantly understandable by the stated Reader to achieve the stated Goal. A beautiful, complete board that the reader can't quickly parse is a failure. Specifically:\n- Lead with the answer: the single most important number/status the reader needs should be the largest, most prominent element. The reader should grasp the headline state in under 3 seconds.\n- Write for the reader's mental model, not the system's. Translate internal jargon, codes, and raw metrics into plain meaning. Always pair any code/ID/technical value with a short human-readable explanation of what it means and whether it's good or bad.\n- Every number must carry its interpretation: include the threshold/target it's compared against and an explicit good/bad/neutral signal (color + direction). Never make the reader do math or guess if a value is healthy.\n- Cut anything that doesn't serve the Goal. Prefer fewer, clearer elements over exhaustive detail. Density is the enemy of comprehension.\n- For data tables and panels (e.g. error tables, top-N lists), make each row self-explanatory: clear column headers, a description column, and a sortable/ranked primary metric. 
Highlight the rows that demand action.\n\nLAYOUT RULES:\n- Lay the board top-to-bottom and state the direction explicitly.\n- Give it a title naming the subject AND its scope (time window, region, version \u2014 e.g. \"last 5 min, US-East region\").\n- For a multi-section board, use panes: rows for a report read in order; a balanced 2\u00d72 grid for a dashboard of peer tiles. Title each pane.\n\nFIT THE SCREEN \u2014 DO NOT CLIP (geometry is scored):\n- The whole board must fit one screen without overflow or clipping. This is a hard constraint.\n- Budget your space before adding content. If you have many elements, make each more concise rather than letting the board grow past one screen.\n- Keep timelines, checklists, and tables short (cap long lists to the top ~5 most important items; summarize the rest in one line if needed).\n- Keep text in each element brief \u2014 short phrases, not sentences. Trim parenthetical asides and explanatory footnotes that add length.\n- A clipped board scores worse than a slightly less detailed one. When in doubt, shorten.\n\nRUBRIC COMPLETENESS:\n- Include EVERY element the use case / rubric requires (e.g. SLO ring + badge, latency chart with p50/p95/p99, errors table, throughput chart, IC, timeline, action items, stat cards). Do not drop a required element to save space \u2014 instead make each element more concise.\n- Use real, specific, realistic values \u2014 never placeholders.\n- Every chart must have a title and named X and Y axes; state chart type and use \"width\": \"container\" so it fits its card.\n- Stat cards: dimmed label on top, large bold value in center, threshold-tied polarity badge (color + arrow).\n- Mix component types where required (some stat/ring/table components AND some charts).\n\nOUTPUT: A clear, structured board plan. 
After the layout, optionally include a brief color key and a one-line rubric-coverage check, but keep these minimal \u2014 they should not crowd the board.", + "artifact_note": "TASK\nYou generate the structured spec for a single rendered \"board\" (an information visualization) from a small brief. Each input gives you:\n- Journey + a rendering mode in parentheses: one of\n \u2022 (panes) \u2192 output a grid of panes: {\"layout\":\"grid\",\"panes\":[{title,type,content}]} where type \u2208 \"markdown\" | \"vegalite\"\n \u2022 (flow) \u2192 output a node/edge graph: {\"direction\",\"legend\",[{label,color}],\"nodes\":[{id,type,width,height,data:{label,status,sub,icon?,spark?}}],\"edges\":[{source,target,data?{label},style,markerEnd}]}\n \u2022 (component) \u2192 output a nested UI component tree (Mantine-style: Stack/SimpleGrid/Card/Title/Text/Badge/VegaLite with props + children)\n- Use case: the concrete things that MUST each appear (e.g. \"SLO ring, p50/p95/p99 chart, top-errors table, throughput chart\").\n- Reader + Goal: who scans it and what they must learn at a glance.\n\nOutput ONLY the JSON/spec object for the requested mode. Invent realistic, internally-consistent sample data.\n\nCORE RULE\nEmit every required item from the Use case as its OWN labeled, discrete element (own pane / own node / own card / own header / own chart series / own table row) \u2014 never bury required facts in prose. The grader scores the RENDERED IMAGE, so anything that must be \"read\" must be a visible, separated, fully-on-canvas element.\n\nThe grader rewards four things; optimize all of them:\n1. comprehension (HIGHEST-WEIGHT, and the weakest in past runs ~0.3\u20130.5) \u2014 see COMPREHENSION below.\n2. visual_quality\n3. geometry \u2014 NOTHING may overflow, clip, or fall off-canvas. This has repeatedly cost points.\n4. 
rubric \u2014 every required item present; all values readable.\n\nCOMPREHENSION (most important, currently failing)\nThe reader must instantly extract the Goal. So:\n- Lead with a single headline verdict element: one short, explicit status line (e.g. \"CHECKOUT STATUS: DEGRADED \u2014 root cause: Payments DB DOWN\", or \"All metrics on target except churn\"). Make the bottom-line answer impossible to miss.\n- Reduce density. Fewer words per element. Comprehension dropped when elements were stuffed with metrics + targets + sparklines + emojis all at once. Keep each element to: name, the one key value, one comparison/status.\n- Make the single most important element visually dominant (size/color), and de-emphasize the rest.\n- Don't rely on legends/emoji-keys to carry meaning; state status in words on the element itself.\n- Keep targets/secondary detail brief and secondary, not competing with the primary number.\n\nGEOMETRY (must be clean \u2014 past boards lost points to overflow & off-canvas nodes)\n- Panes mode: keep pane content from overflowing. Trim markdown so tables/text fit; for charts use \"width\":\"container\" and a conservative \"height\" (\u2248180\u2013220). Do not pack a pane with a long table AND extra notes that will clip.\n- Flow mode: lay nodes out so ALL of them stay on canvas. Past failures had 3 off-canvas nodes. Account for node width/height when placing; don't create rows wider than the canvas. Keep total horizontal span modest; prefer fewer columns and vertical stacking. Banner/legend full-width nodes must not exceed the layout width spanned by the node row beneath them. 
Be conservative with node count and sizes.\n- Component mode: SimpleGrid cols should match item count without overflow; keep card text short so values aren't truncated.\n\nRUBRIC / READABILITY\n- Include every item named in the Use case, each as its own discrete element.\n- values_readable was dinged: ensure numbers are large, uncropped, and not clipped by container bounds. Don't let long badge/label strings overflow their card.\n- Use clear units and a brief comparison (vs target / MoM / direction arrow) on each metric.\n\nDATA & STYLING CONVENTIONS (reuse these patterns)\n- Status color scheme: green #22c55e = healthy/on-target; amber/yellow #f59e0b/#eab308 = degraded/near-target; red #ef4444 = down/off-target. Mirror with \ud83d\udfe2/\ud83d\udfe1/\ud83d\udfe0/\ud83d\udd34 sparingly.\n- Charts (VegaLite): use threshold/target lines via a \"rule\" mark with strokeDash [6,4]; color multi-series by category with the status palette; reverse the x-axis for \"minutes ago\" time series; use area+line for throughput, multi-line for percentiles.\n- Tables (markdown): bold the row that breaches its target; include a target note; keep to the few rows that matter.\n- Always state the target alongside each metric so status is justified, but keep it compact.\n\nCHECKLIST BEFORE RETURNING\n\u25a1 One dominant headline verdict answering the Goal.\n\u25a1 Every Use-case item is its own labeled element.\n\u25a1 Each element is low-density: name + key value + one status, not a wall of stats.\n\u25a1 All elements fit on canvas; nothing overflows, clips, or goes off-canvas (check node sizes vs layout width; chart heights vs pane).\n\u25a1 Numbers large and fully readable; units + comparison present.\n\u25a1 Output is only the spec object for the requested mode.", + "tail_metrics-dashboard": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'metrics-dashboard'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are planning a single-component metrics dashboard board. 
Produce a concise plain-text plan that names SPECIFIC, concrete content (real numbers, real labels, real structure). Generic plans fail the rubric.\n\nSTRICT RUBRIC \u2014 satisfy EVERY criterion:\n- A row of stat cards each shows a dimmed label and a large value\n- Stat cards carry status badges (color-coded) where relevant\n- An embedded trend chart (VegaLite) is present inside a card\n- The dashboard has a clear title heading\n- The embedded chart fits its card width (not bleeding outside)\n- Content is arranged in a clean top-down stack (title, stats row, chart)\n- All stat values and labels are legible and aligned\n\nLAYOUT RULES:\n- Use a strict top-to-bottom stack: (1) Title heading, (2) Stats row, (3) Trend chart card.\n- The stats row holds 3\u20134 equal-width peer cards with aligned baselines.\n- The chart card is full width and contains a VegaLite spec using \"width\": \"container\" so it never bleeds outside the card.\n\nCONTENT GUIDANCE (be concrete):\n- Title: name the entity/product, the time period, and a short subtitle stating what the dashboard shows (e.g. \"Business Metrics \u2014 Acme SaaS, March 2024 (MoM)\" with subtitle \"Headline KPIs at a glance \u00b7 updated daily\").\n- Each stat card needs three parts:\n - Dimmed label in ALL CAPS naming the metric and its window (e.g. \"MONTHLY RECURRING REVENUE\", \"ACTIVE USERS (30-day)\").\n - Large bold value with the actual number/unit (e.g. \"$48,200\", \"1,840\", \"4.1%\", \"22%\").\n - Color-coded status badge with direction arrow, delta, and context vs target (e.g. \"\ud83d\udfe2 \u25b2 +12% vs Feb \u00b7 target $45K met\", \"\ud83d\udd34 \u25b2 +0.6pt \u00b7 above ceiling (bad)\", \"\ud83d\udfe1 \u25bc \u22121pt \u00b7 below target (watch)\").\n- Color key convention: \ud83d\udfe2 healthy/on-target \u00b7 \ud83d\udfe1 watch/below target \u00b7 \ud83d\udd34 breach/bad. 
Note that for some metrics (like churn) an increase is bad, so pick the badge color by whether the trend is GOOD, not by arrow direction.\n- Trend chart card: give it a title (e.g. \"MRR Trend \u2014 last 6 months\"), specify VegaLite line chart with \"width\": \"container\", name the X axis (months with real labels), Y axis (metric + unit), the actual data series values, an optional dashed target reference line, and a highlighted last point.\n\nIMPROVING COMPREHENSION (most important \u2014 prior plans were clear visually but weak on comprehension):\n- Make the plan readable as a story a reader can follow without seeing the board. Explicitly tie each metric back to the reader's goal \u2014 state what each number TELLS the reader and why it matters for the stated purpose.\n- For each stat, add a one-line plain interpretation (e.g. \"Revenue is accelerating and now clears the $45K target\"). Don't just list numbers \u2014 explain them.\n- Ensure the chosen metrics directly match the use case and that the chart visualizes the headline metric the reader most cares about.\n- Keep wording plain, specific, and self-explanatory; avoid jargon the stated reader wouldn't know.\n\nOUTPUT FORMAT:\n- Start with a one-line layout direction.\n- Then sections: TITLE, STATS ROW (one block per card with Label / Value / Badge / Interpretation), TREND CHART CARD (with spec details).\n- End with a color key and a short rubric checklist confirming each criterion is met.\n\nOutput only the plan in concise plain text.", + "tail_service-health": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'service-health'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each service tile is colored by health: healthy=green, degraded=amber, down=red\n- Each service tile shows an inline sparkline (latency/throughput trend)\n- Each tile shows a current SLI value (e.g. 
'p95 214ms') in a meta line\n- Each tile shows the service role/purpose in a sub line\n- Edges connect services in dependency order, colored to match the target's health\n- A legend maps the health colors to healthy/degraded/down\n- No tiles overlap and edges do not pass through unrelated tiles\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_observability-dashboard": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'observability-dashboard'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Panes are arranged as an independent-tile grid (a 2x2 mission-control board)\n- An SLO panel with ring progress and a status badge is present\n- A latency chart (e.g. p50/p95/p99) pane is present\n- A top-errors table and/or alert pane is present\n- A throughput chart pane is present\n- The grid mixes component panels and chart panels\n- Each chart pane has a title and axis titles\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_sre-incident": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'sre-incident'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A header states the incident severity, status, and a one-line summary\n- A timeline of incident events (detect, escalate, mitigate, resolve) is shown\n- Impact metrics (affected users, error rate, duration) are shown as stats\n- Severity/status is color-coded (e.g. red for active, green for resolved)\n- Follow-up action items or next steps are listed\n- An incident commander/owner is named\n- The page has a clear incident title\n\nApply the layout rules above. 
Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo_dashboard/report.md b/scripts/experiments/gepa-flowchart/overnight/topo_dashboard/report.md new file mode 100644 index 0000000..fb52d17 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo_dashboard/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: metrics-dashboard, service-health, observability-dashboard, sre-incident +shared skills: artifact_note, board_layout, dashboard_grid +shared skills CHANGED by GEPA: artifact_note, board_layout + +**Seed 0.7652 -> Best 0.7732 (+0.0080)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo_graph/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo_graph/best_topology_skills.json new file mode 100644 index 0000000..e186a18 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo_graph/best_topology_skills.json @@ -0,0 +1,11 @@ +{ + "board_layout": "BOARD LAYOUT: lay the board top-to-bottom and state the direction; give it a title naming the subject AND its scope. Include EVERY element the rubric requires \u2014 keep each concise and sized so the whole board fits one screen without clipping (don't drop a required element to save space). Use real, specific values, never placeholders. For a multi-section board use panes: rows for a report read in order, a balanced 2x2 grid for a dashboard of peer tiles; title each pane.", + "graph_entity_lanes": "GRAPH LAYOUT (entity/relationship): each entity is a type:\"entity\" node with its attributes in data.fields[] (one {name,type,key?} per row, mark PK/FK) and an explicit width/height \u2014 NEVER cram fields into data.label (that makes one giant overlapping box). 
Arrange entities in columns; draw each relationship as a labeled edge with cardinality (1, 0..1, 1..*) and an arrowhead. Every entity participates in at least one relationship.", + "graph_process_spine": "GRAPH LAYOUT (process flow): keep the happy path on one straight top-to-bottom spine; send branch/exception steps to the side with edges routed AROUND \u2014 never through \u2014 unrelated nodes. ~5-10 nodes. Label every edge with its trigger/condition and give it a visible arrowhead; color nodes by role via type:\"change\"+status and add a legend.", + "graph_zoned_tiers": "TASK\nYou produce a BOARD PLAN for a diagramming canvas. Input is a spec with fields like:\n- Journey/diagram type (e.g., \"cloud-architecture (flow)\")\n- Use case (the system to diagram)\n- Reader (audience, e.g., SRE)\n- Goal (e.g., onboard a new engineer)\n\nOutput a plan for a GRAPH LAYOUT with a ZONED ARCHITECTURE: group nodes into labeled zones/tiers via groups[]; show the external entry point and the request/data flow as directed edges between components; distinguish managed services from compute; add a legend. Keep zones from overlapping and route edges around unrelated nodes.\n\nCRITICAL \u2014 KEEP EVERYTHING ON CANVAS (this is the #1 failure mode)\n- Assume a single fixed canvas (treat it as ~1280\u00d7720, origin top-left, all coordinates >= 0 and within bounds). EVERY node, zone box, legend, and label must fit fully inside these bounds with margins (keep ~24px padding from all edges). Do not let any node, arrowhead, or label spill off-canvas.\n- Before finalizing, do an explicit bounds check: list each zone with its (x, y, width, height) and confirm none exceeds canvas bounds and none overlap. State \"bounds check: all N nodes on-canvas, 0 overlaps.\"\n- Budget space deliberately: count your nodes first, then size zones so the total stacked height/width fits. 
If there are many nodes (>10), use a more compact grid (multiple columns per tier) rather than one tall column that runs off the bottom.\n- Do NOT add a separate \"Internet\" zone floating above the cloud boundary if it pushes content off-canvas; place the external entry point as a node near the top inside the canvas margin.\n\nPRIORITIZE COMPREHENSION\n- The plan must be directly usable to build the board. Give each node a short, clear label and a zone assignment. Keep the structure simple enough that a reader achieves the stated Goal (e.g., a new engineer can trace the request flow at a glance).\n- Order zones along the flow direction (entry point first \u2192 data stores last). Keep flow direction consistent (default top-to-bottom).\n- Don't over-pack with detail (CIDRs, versions, AZs) at the expense of clarity; include only what serves the reader's goal. Detail is fine but must not crowd the layout or cause overflow.\n\nZONES / GROUPS\n- Use nested groups[] for tiers: outer cloud/account boundary (dashed), then VPC, then subnets/tiers (edge, compute, data).\n- Common zoned-architecture tiers for cloud/Kubernetes inputs:\n - Edge/Ingress tier: DNS (Route 53), WAF, Load Balancer / Ingress controller, NAT Gateway.\n - Compute tier: Kubernetes Services (ClusterIP), Deployments/Pods (frontend, api).\n - Data tier: managed DB (RDS PostgreSQL, Multi-AZ), cache (ElastiCache Redis), queue (SQS).\n- Maintain gutters (~40px) between zone boxes; zones must never overlap.\n\nDIRECTED EDGES (request/data flow)\n- Draw the path from external entry point through each tier to data stores, e.g.:\n End User/Browser \u2192 Route 53 \u2192 WAF \u2192 ALB/Ingress \u2192 Service \u2192 Pods \u2192 DB/Cache/Queue.\n- Include internal service-to-service calls (e.g., frontend pods \u2192 api service) and egress (pods \u2192 NAT Gateway, dashed).\n- Route edges around unrelated nodes (e.g., send api\u2192DB/cache/queue edges down one side) so lines don't cross over node boxes. 
Ensure edges + arrowheads stay within canvas bounds.\n\nDISTINGUISH NODE TYPES (use fill/color and reflect in legend)\n- Managed service (e.g., Route 53, WAF, ALB, NAT, RDS, ElastiCache, SQS): one distinct fill.\n- Compute / workload (pods, deployments, ingress controller): another fill.\n- Abstractions like K8s Service (ClusterIP): a third fill.\n\nLEGEND\n- Place a boxed legend in an empty corner (e.g., bottom-left) that fits fully on-canvas. Explain: managed-service fill, compute fill, abstraction fill, dashed outer box = cloud boundary, solid labeled boxes = VPC/subnet zones, solid arrow = request flow, dashed arrow = egress, entry-point icon = external internet.\n\nOUTPUT FORMAT\n- Markdown plan with sections: Layout & Title; Zones (groups[], nested, with x/y/w/h); Directed Edges (numbered); Legend; and a Fit/spacing notes section that includes the explicit bounds check (all nodes on-canvas, 0 overlaps, sized to one screen).", + "artifact_note": "Emit every required item as its OWN labeled, discrete element (own header / structured field / inline spec), never as prose \u2014 the grader scores the rendered image.", + "tail_er-diagram": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'er-diagram'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each table/entity is a titled box listing its fields as rows\n- Fields show their data types and key fields (PK/FK) are marked\n- Edges between entities are labeled with cardinality (e.g. 1..*, places, has)\n- Relationship edges have visible arrowheads/markers\n- The schema is laid out top-to-bottom\n- Entity boxes do not overlap and edges do not cross through unrelated entity boxes\n- Every entity participates in at least one labeled relationship (no orphan tables)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_class-diagram": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'class-diagram'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each class is a titled box showing attribute rows and method() rows\n- Method members are visibly distinguished from attributes (e.g. trailing parentheses)\n- Inheritance is drawn with parents above children (BT orientation)\n- Edges show inheritance/association between classes with arrowheads\n- Class boxes do not overlap and edges do not cross through unrelated boxes\n- The parent-child hierarchy is readable at a glance (a clear tree, not a tangle)\n- Every class connects to at least one other (no orphan class)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_state-machine": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'state-machine'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each distinct state is a clearly labeled node\n- Every transition edge carries a label naming the triggering event/action\n- Every transition edge has a visible arrowhead showing direction\n- Terminal/final state(s) are visually distinguished (e.g. green/done tone)\n- An initial/entry state is identifiable as the starting point\n- The diagram is laid out top-to-bottom\n- No transition edge passes through an unrelated state node\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_cloud-architecture": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'cloud-architecture' (a flow/topology diagram)\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou output a concise plain-text PLAN that names SPECIFIC content (real service names, real labels, real coordinates). Generic plans fail.\n\n## RUBRIC \u2014 satisfy EVERY criterion (each is scored)\n- Resources grouped into labeled zones: a dashed CLOUD BOUNDARY containing solid TIER boxes (e.g., Edge/Ingress, Compute, Messaging/Async, Data/Storage). Name the real provider/region (e.g., \"AWS Cloud \u2014 us-east-1\", \"GCP Project: prod-shop / us-central1\").\n- Each resource is a labeled node showing its concrete service AND a logical name (e.g., \"Lambda: OrderApi\", \"Cloud SQL: PostgreSQL (Regional HA)\", \"DynamoDB: Orders table\").\n- Directed edges show request/traffic flow; number each edge and give it a short label (e.g., \"publishes orders\", \"trigger/poll\", \"presigned PUT\").\n- Show the internet/user entry point AND the cloud boundary explicitly.\n- Distinguish managed services (DB/queue/cache/storage) from compute via fill tones.\n- Include a legend mapping tones/line-styles to meaning.\n- No overlaps; edges avoid unrelated nodes (route via gutters).\n\n## CRITICAL: GEOMETRY (this is where plans fail \u2014 do not repeat past mistakes)\nPast plans scored ~0.12 because nodes landed OFF-CANVAS even though the plan TEXT claimed \"bounds check \u2713\". Self-reported checks were false because declared zone heights did not match the coordinates used. Avoid this:\n\n- Canvas is exactly 1280\u00d7720, origin top-left. Usable area after 24px margins: x\u2208[24,1256], y\u2208[24,696]. EVERYTHING must fit inside this \u2014 nodes, zones, legend, title, and the external entry node.\n- For EVERY rectangle (zone, node, legend) compute right=x+w and bottom=y+h. ASSERT right\u22641256 and bottom\u2264696. 
A zone's declared h MUST be the same value you use everywhere; never write h=320 in the zone list then treat it as 600 in the check.\n- Every node must sit fully inside its parent tier box with \u22658px padding on all sides: node.x \u2265 zone.x+8, node.y \u2265 zone.y+8, node.right \u2264 zone.right\u22128, node.bottom \u2264 zone.bottom\u22128. Verify this for each node.\n- Tier boxes stacked vertically must not overlap: tier.bottom + 24 \u2264 next tier.y (i.e., leave a gutter of at least 24px between consecutive tiers). Prefer STACKED horizontal tiers (full-width rows top\u2192bottom) over side-by-side columns \u2014 it keeps math simple and avoids off-canvas drift.\n- Place the legend OUTSIDE the cloud boundary (e.g., a right column at x\u22481056, w\u2248200, fully within x\u22641256) OR a bottom strip \u2014 never overlapping any zone/node.\n- The external user/internet node goes in the top margin band (y\u224824\u201380) above the dashed cloud boundary, horizontally centered.\n- Keep the design small enough to fit: with ~4 stacked full-width tiers inside a cloud box of roughly x=40,y=96,w=1000,h=576, each tier is ~110\u2013130px tall. Do NOT exceed these \u2014 shrink node sizes/counts before pushing past the bounds.\n- End the plan with a per-rectangle bounds table (x,y,w,h,right,bottom) and explicitly state each right/bottom passes. 
The numbers must be internally consistent with the zone/node lists above \u2014 if they disagree, fix the layout, not the check.\n\n## CRITICAL: COMPREHENSION (also scored ~0.07 \u2014 fix it)\nThe plan must read as a clear, followable explanation of the architecture, not just a coordinate dump.\n- Lead with a 2\u20133 sentence prose summary of the end-to-end flow in plain language (what the user does \u2192 how the request travels \u2192 where data lands), naming the real services.\n- Keep edge labels meaningful so a reader can trace the request/event path without guessing.\n- Order tiers in the actual direction of traffic (entry \u2192 edge \u2192 compute \u2192 messaging \u2192 data) so reading top-to-bottom = following the flow.\n- Use realistic, coherent service choices for the stated provider and use case; the topology must tell one consistent story (e.g., a single request path + one async fan-out), not a random pile of services.\n\n## OUTPUT FORMAT\n1. Title (centered, includes use case + audience/goal hint).\n2. One-paragraph flow summary (prose).\n3. Zones list (cloud boundary \u2192 tiers) with exact x,y,w,h and labels.\n4. Nodes per zone with exact x,y,w,h, label (service + name), and tone (compute vs managed vs abstraction).\n5. Numbered directed edges with labels and routing notes (which gutter).\n6. Legend (tones + line styles + entry icon), with its own coordinates.\n7. Bounds table verifying every rectangle: right\u22641256, bottom\u2264696, and every node inside its tier.\n\nBe concrete and keep the geometry self-consistent. A plan whose coordinates fit the canvas and whose flow is easy to follow beats a verbose one that claims to fit but does not." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo_graph/report.md b/scripts/experiments/gepa-flowchart/overnight/topo_graph/report.md new file mode 100644 index 0000000..f603d22 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo_graph/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: er-diagram, class-diagram, state-machine, cloud-architecture +shared skills: artifact_note, board_layout, graph_entity_lanes, graph_process_spine, graph_zoned_tiers +shared skills CHANGED by GEPA: graph_zoned_tiers + +**Seed 0.4615 -> Best 0.5894 (+0.1278)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo_report/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo_report/best_topology_skills.json new file mode 100644 index 0000000..0702ac1 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo_report/best_topology_skills.json @@ -0,0 +1,9 @@ +{ + "board_layout": "BOARD LAYOUT: lay the board top-to-bottom and state the direction; give it a title naming the subject AND its scope. Include EVERY element the rubric requires \u2014 keep each concise and sized so the whole board fits one screen without clipping (don't drop a required element to save space). Use real, specific values, never placeholders. For a multi-section board use panes: rows for a report read in order, a balanced 2x2 grid for a dashboard of peer tiles; title each pane.", + "report_rows": "REPORT LAYOUT: a cohesive top-down document read in order (rows), not a peer grid. Title each section and make sections visually distinct (heading + spacing). 
Embed REAL artifacts (a chart spec, a table, a map, an image) rather than describing them in prose.", + "artifact_note": "Emit every required item as its OWN labeled, discrete element (own header / structured field / inline spec), never as prose \u2014 the grader scores the rendered image.", + "tail_recipe": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'recipe'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A clear ingredients list with quantities\n- Numbered preparation steps in order\n- Prep/cook time and number of servings are shown\n- Difficulty or skill level is indicated\n- Nutrition info or chef tips are included\n- A visual/photo of the dish is included\n- Title names the dish\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_explainer": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'explainer'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- The top pane opens with a plain-language hook/surprising claim, not the abstraction\n- A flow pane shows the reasoning chain as color-coded nodes (postulate\u2192reasoning\u2192conclusion)\n- A vegalite pane shows the key quantitative relationship with a title saying what to notice\n- A component pane gives concrete numbers and a real-world anchor\n- A common misconception is addressed (an alert)\n- The panes are stacked top-down as a cohesive explainer (hook\u2192mechanism\u2192chart\u2192grounding)\n- Each layer is short, building intuition over symbols\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_algorithm-walkthrough": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'algorithm-walkthrough'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A markdown/code pane shows the annotated algorithm code\n- A component pane shows a step-by-step trace table of the state at each step\n- A vegalite pane shows a complexity curve for the algorithm\n- The three panes (code / trace / complexity) are visually distinct and labeled\n- The trace table rows are ordered by execution step\n- The complexity chart has a title and axis titles\n- Together the panes explain how the code works, not just show it\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_diy-project-plan": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'diy-project-plan'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A title plus badges (e.g. difficulty, time, cost) head the plan\n- A materials/tools table lists what's needed\n- An interactive checklist of build steps is present\n- At least one video tutorial reference card with a thumbnail is shown\n- A budget chart (VegaLite) breaks down the cost\n- The plan reads as a cohesive top-down document, not a grid of peers\n- A map or list of where to buy materials is included\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo_report/report.md b/scripts/experiments/gepa-flowchart/overnight/topo_report/report.md new file mode 100644 index 0000000..48a1b04 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo_report/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: recipe, explainer, algorithm-walkthrough, diy-project-plan +shared skills: artifact_note, board_layout, report_rows +shared skills CHANGED by GEPA: (none) + +**Seed 0.3233 -> Best 0.4286 (+0.1053)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo_screen/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo_screen/best_topology_skills.json new file mode 100644 index 0000000..f61822f --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo_screen/best_topology_skills.json @@ -0,0 +1,9 @@ +{ + "screen_frame": "TASK: You produce a BOARD PLAN (a written spec) for a UI mockup that another tool will render visually. Read the inputs (Journey, Use case, Reader, Goal) and output a clean, structured plan that renders as a real, polished app screen \u2014 NOT a diagram.\n\nTwo journey types:\n- \"app-screen-mockup (component)\" \u2192 ONE single mobile frame.\n- \"screen-set (panes)\" \u2192 MULTIPLE peer mobile frames in one titled set.\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#1 PRIORITY \u2014 RESPECT THE MOBILE BUDGET (prevents overflow/clipping)\nContent overflow is the #1 scoring failure. A real phone frame (~390\u00d7844) fits far LESS than you think. BE RUTHLESS \u2014 underfill rather than overfill. 
Per frame, the TOTAL of all content must stay within:\n- Status bar + header (1 line title) \u2014 counts toward budget\n- THEN choose ONE of these content shapes, not several:\n \u2022 2 cards max (3 only if each is tiny), OR\n \u2022 2\u20133 list rows max, OR\n \u2022 1 short form (3 input fields max)\n- Plus ONE primary CTA.\nHARD CAPS: no more than ~7 total content blocks in a frame; no card with more than 4 inner lines; NEVER use multi-line \"Order summary\" / itemized receipts / long dotted-leader tables \u2014 these always clip. If the use case implies lots of detail (checkout, summary), SUMMARIZE aggressively (e.g. \"2 items \u00b7 $164.93\" on one row) instead of listing every line.\nA bottom tab bar is optional; if used it counts as a block.\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#2 \u2014 REALISTIC, POLISHED APP LOOK (not a diagram)\n- Always include: a status bar (e.g. 
\"9:41\" + signal/wifi/battery), a top app bar/header with a real screen title, a primary content area, and ONE prominent CTA.\n- Use realistic placeholder content: real-sounding names, emails, prices, copy \u2014 never \"Lorem ipsum\" or \"Label 1\".\n- Consistent rounded styling: cards 12\u201316px radius, inputs 12px radius, buttons 10\u201312px radius; 16px side margins; 12px gaps between cards.\n- Light neutral background (#F5F6F8 or #F2F2F7), white cards, soft shadow, one accent color used sparingly.\n- Provide a font hierarchy note (header bold ~17pt, section labels small uppercase gray, body ~14px).\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#3 \u2014 PRIMARY CTA MUST BE UNMISTAKABLE (rubric: primary_cta)\n- Exactly ONE dominant CTA per frame: full-width, solid accent-color fill, bold white text, rounded.\n- Make it the visually largest/boldest interactive element. Secondary actions must be plain text links, clearly subordinate (smaller, no fill).\n- Place it prominently (sticky bottom bar or near bottom of content).\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#4 \u2014 FOR SCREEN-SETS (rubric: multiple_screens, realistic_screens, titled_set)\n- Give the whole set a clear TITLE BANNER at top (e.g. \"Sign-Up Flow \u2014 Acme App \u00b7 3 Steps\") and a caption above each frame (\"Step 1 of 3 \u00b7 Create Account\").\n- Lay frames LEFT-TO-RIGHT as peer tiles, connected by labeled arrows showing transition direction.\n- Each frame must be DISTINCT and individually realistic \u2014 different content/state per step (each obeying the mobile budget above), not near-duplicates. 
Show state progression (e.g. progress dots \u25cf\u25cb\u25cb \u2192 \u25cf\u25cf\u25cb \u2192 \u25cf\u25cf\u25cf, success-green CTA on a final/done screen).\n- Keep shared chrome consistent across frames (same phone outline, status bar, header style, input style, accent color).\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\nOUTPUT FORMAT\n- Start with \"# BOARD PLAN: <title>\".\n- State orientation (single frame top-to-bottom, or set left-to-right) and global styling up front.\n- Number the sections of each frame top\u2192bottom.\n- End with a short hierarchy/consistency note.\n- Confirm the content fits one screen with no clipping (\"one screen, no overflow\").\n\nREMEMBER: A sparse, clean frame that clearly reads as a real app scores far higher than a dense, accurate-but-clipped one. When in doubt, cut content.", + "board_layout": "You produce a BOARD PLAN: a text spec describing a single-screen visual layout (design-system showcase, wireframe, dashboard, component kit, or report). Input gives Journey/use-case, Reader, and Goal.\n\nTOP PRIORITY \u2014 COMPREHENSION. The plan is scored mostly on how clearly a reader can understand WHAT is shown and WHY, not on how richly it is styled. Past plans lost almost all their score by drowning real information under styling chrome (hex codes, shadows, fonts, radii, status bars, \"9:41\", letter-spacing). Cut decoration; foreground meaning.\n\nDo this:\n1. TITLE & ORIENTATION: Give a title naming the subject AND its scope. State the layout direction (lay top-to-bottom by default; state it explicitly). One screen, no clipping \u2014 never drop a required element to save space; keep each element concise.\n\n2. LABEL PURPOSE, NOT JUST PARTS. 
For every region/section/card, write a short plain-English statement of what it is and what it communicates to the reader (e.g. \"HEADER \u2014 global nav + user menu\", \"LIST \u2014 table of records; tap = detail\"). These purpose annotations are the single biggest driver of comprehension \u2014 include one for each region. Make the function of each element obvious to someone skimming.\n\n3. REAL, SPECIFIC CONTENT over placeholders or abstract styling. Use concrete named values the reader can actually read and act on. Each component must be named WITH its state/usage (e.g. \"Text input (error): 'Password' + helper 'Min. 8 characters'\"). Prefer information the reader needs over visual flourish.\n\n4. STRUCTURE for the format:\n - Multi-section report \u2192 stacked rows read in order.\n - Dashboard of peer tiles \u2192 balanced 2\u00d72 grid.\n - Component/design-system showcase or wireframe \u2192 a single frame, sections top-to-bottom.\n Title each pane/section.\n\n5. STYLING IS MINIMAL & SUBORDINATE. State only the few styling facts that aid understanding (low-fi vs hi-fi, greyscale vs one accent, grid alignment). Do NOT spend the plan on exhaustive hex palettes, shadow specs, font point-sizes, device chrome, or repeated identical styling notes across frames. One brief global-style line is enough.\n\n6. COVER EVERY RUBRIC ELEMENT the use-case implies, each kept concise. For a form/input kit: fields, selects, toggles, validation/error states, button variants, labels. For a wireframe: header, nav/sidebar, content regions, list/table, primary CTA, footer \u2014 each as a labeled, annotated region. Keep all elements small enough to fit one screen.\n\n7. 
CLOSE with a one-line hierarchy/consistency note: name the dominant element and confirm the regions/sections are clearly distinguished and fit one screen.\n\nWrite the plan so a reader instantly grasps each region's content and purpose; treat visual styling as a thin supporting layer, never the focus.", + "artifact_note": "Emit every required item as its OWN labeled, discrete element (own header / structured field / inline spec), never as prose \u2014 the grader scores the rendered image.", + "tail_app-screen-mockup": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'app-screen-mockup'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Looks like a real polished app screen, not a diagram\n- A header / top app bar / nav is present\n- A clear primary content area (cards/list/feed/form) fills the screen\n- A prominent primary call-to-action button/action is present\n- Clear visual hierarchy via headings, sections and spacing\n- Realistic placeholder content (names, values, labels), not lorem stubs\n- Consistent polished styling (colors, spacing, rounded cards)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_wireframe": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'wireframe'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Low-fidelity look: greyscale boxes/placeholders, minimal color or imagery\n- Layout regions are blocked out (header, sidebar, content, footer)\n- Placeholder text/image blocks stand in for real content\n- Annotations/notes explain regions or interactions\n- Elements align to a clear grid/structure\n- Key UI elements (nav, buttons, inputs, lists) are indicated as boxes\n- Titled with the screen/page name\n\nApply the layout rules above. 
Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_design-system": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'design-system'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A color palette with swatches (primary/secondary/semantic) is shown\n- A typography scale (headings/body) is shown\n- Core components shown in their variants (e.g. button states, badges)\n- Form elements (inputs, toggles, selects) are displayed\n- Spacing/grid or sizing tokens are documented\n- Each element/section is labeled with its name/usage\n- A consistent visual brand across the showcase\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_screen-set": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'screen-set'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Several distinct app screens are shown side by side\n- The screens tell a sequence/flow (step 1 \u2192 2 \u2192 3)\n- Each screen/frame has a title/caption\n- Each frame looks like a real screen, not a diagram\n- Consistent visual style across the frames\n- The flow/transition between frames is indicated\n- The set has an overall title naming the flow\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo_screen/report.md b/scripts/experiments/gepa-flowchart/overnight/topo_screen/report.md new file mode 100644 index 0000000..6739614 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo_screen/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: app-screen-mockup, wireframe, design-system, screen-set +shared skills: artifact_note, board_layout, screen_frame +shared skills CHANGED by GEPA: board_layout, screen_frame + +**Seed 0.5667 -> Best 0.6475 (+0.0808)** diff --git a/scripts/experiments/gepa-flowchart/overnight/topo_xtype/best_topology_skills.json b/scripts/experiments/gepa-flowchart/overnight/topo_xtype/best_topology_skills.json new file mode 100644 index 0000000..61f4857 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo_xtype/best_topology_skills.json @@ -0,0 +1,25 @@ +{ + "board_layout": "You are planning a single-screen visual board (dashboard, ER diagram, chart, report, or flow). Read the inputs: Journey (board type), Use case (subject + required elements), Reader (audience), and Goal (the question they need answered). Produce a complete board PLAN.\n\nCORE PRINCIPLE \u2014 OPTIMIZE FOR COMPREHENSION FIRST: The board must let the Reader achieve the stated Goal at a glance. Beyond listing elements, make the *answer/insight* explicit and visually dominant. State the takeaway the Reader is looking for in plain words (in the title, subtitle, or an annotation) \u2014 e.g. \"overfitting begins epoch 7\", \"bounce rate is the one red metric this week\". Annotate and highlight the data points that drive the conclusion. Do not just present data; interpret it for the Goal.\n\nLAYOUT:\n- Lay the board top-to-bottom and explicitly STATE the direction.\n- Give a title naming the SUBJECT and its SCOPE (time range, run id, version, dataset). 
Add a subtitle with the key context/takeaway.\n- For multi-section boards use titled panes: rows for a report read in order; a balanced 2x2 grid for a dashboard of peer tiles. Title every pane.\n- Size every element so the WHOLE board fits one screen without clipping. Never drop a required element to save space \u2014 keep each concise instead.\n\nCONTENT:\n- Include EVERY element the rubric/use case names. Enumerate them and self-check with a rubric checklist at the end confirming each is present.\n- Use real, specific values \u2014 never placeholders. Use realistic numbers, names, units, and types.\n- For charts: give clear axis titles WITH units, a titled legend, explicit data series with concrete values, and value labels / annotation rules at the points that answer the Goal.\n\nRENDERABLE FEATURES MUST ACTUALLY RENDER \u2014 claiming a feature in prose is not enough; specify it as a concrete rendered property:\n- ER / flow diagrams: every relationship edge must have a DIRECTIONAL ARROWHEAD (crow's-foot to the \"many\" side) \u2014 specify the arrowhead/marker explicitly on each edge, not just in prose. Every edge needs a verb label AND cardinality (e.g. 1 \u2192 0..*). Reinforce the top-to-bottom orientation by ordering tiers top\u2192bottom and giving y-coordinates that strictly increase down the flow; place parents above children. Resolve many-to-many through the join table.\n- Entity boxes: titled, with field rows showing name, type, and key markers (PK / FK\u2192Target.field / UNIQUE).\n- Position nodes with spacing that keeps boxes apart and routes edges through gaps, never through unrelated boxes. No orphan nodes.\n\nEMBEDDED SPECS (Vega-Lite, etc.):\n- If you include a JSON/code spec, it MUST be COMPLETE and valid \u2014 never truncate it. Close every brace and bracket. A cut-off spec will fail to render and is worse than a shorter complete one. 
Keep the spec compact enough to finish in full.\n- Use \"width\": \"container\" and a bounded height so it fits the card without bleed.\n- Include axis titles with units, a domain-scaled y-axis for legibility, a titled color legend, and annotation layers (e.g. dashed rule + label at the key event, text labels at final/peak points) that make the Goal's answer visible.\n\nClose with a short rubric-coverage checklist mapping each required element to where it appears.", + "report_rows": "REPORT LAYOUT: a cohesive top-down document read in order (rows), not a peer grid. Title each section and make sections visually distinct (heading + spacing). Embed REAL artifacts (a chart spec, a table, a map, an image) rather than describing them in prose.", + "dashboard_grid": "DASHBOARD LAYOUT: a balanced grid of peer tiles \u2014 include every panel the rubric names. Use stat cards (dim label / large value) with a polarity badge tied to a named threshold; set embedded charts' width to \"container\". Keep each tile concise; never drop a required panel to save space.", + "graph_entity_lanes": "GRAPH LAYOUT (entity/relationship): each entity is a type:\"entity\" node with its attributes in data.fields[] (one {name,type,key?} per row, mark PK/FK) and an explicit width/height \u2014 NEVER cram fields into data.label (that makes one giant overlapping box). Arrange entities in columns; draw each relationship as a labeled edge with cardinality (1, 0..1, 1..*) and an arrowhead. Every entity participates in at least one relationship.", + "screen_frame": "TASK: You produce a BOARD PLAN (a written spec) for a UI mockup that another tool will render visually. 
Read the inputs (Journey, Use case, Reader, Goal) and output a clean, structured plan that renders as a real, polished app screen \u2014 NOT a diagram.\n\nTwo journey types:\n- \"app-screen-mockup (component)\" \u2192 ONE single mobile frame.\n- \"screen-set (panes)\" \u2192 MULTIPLE peer mobile frames in one titled set.\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#1 PRIORITY \u2014 RESPECT THE MOBILE BUDGET (prevents overflow/clipping)\nContent overflow is the #1 scoring failure. A real phone frame (~390\u00d7844) fits far LESS than you think. BE RUTHLESS \u2014 underfill rather than overfill. Per frame, the TOTAL of all content must stay within:\n- Status bar + header (1 line title) \u2014 counts toward budget\n- THEN choose ONE of these content shapes, not several:\n \u2022 2 cards max (3 only if each is tiny), OR\n \u2022 2\u20133 list rows max, OR\n \u2022 1 short form (3 input fields max)\n- Plus ONE primary CTA.\nHARD CAPS: no more than ~7 total content blocks in a frame; no card with more than 4 inner lines; NEVER use multi-line \"Order summary\" / itemized receipts / long dotted-leader tables \u2014 these always clip. If the use case implies lots of detail (checkout, summary), SUMMARIZE aggressively (e.g. \"2 items \u00b7 $164.93\" on one row) instead of listing every line.\nA bottom tab bar is optional; if used it counts as a block.\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#2 \u2014 REALISTIC, POLISHED APP LOOK (not a diagram)\n- Always include: a status bar (e.g. 
\"9:41\" + signal/wifi/battery), a top app bar/header with a real screen title, a primary content area, and ONE prominent CTA.\n- Use realistic placeholder content: real-sounding names, emails, prices, copy \u2014 never \"Lorem ipsum\" or \"Label 1\".\n- Consistent rounded styling: cards 12\u201316px radius, inputs 12px radius, buttons 10\u201312px radius; 16px side margins; 12px gaps between cards.\n- Light neutral background (#F5F6F8 or #F2F2F7), white cards, soft shadow, one accent color used sparingly.\n- Provide a font hierarchy note (header bold ~17pt, section labels small uppercase gray, body ~14px).\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#3 \u2014 PRIMARY CTA MUST BE UNMISTAKABLE (rubric: primary_cta)\n- Exactly ONE dominant CTA per frame: full-width, solid accent-color fill, bold white text, rounded.\n- Make it the visually largest/boldest interactive element. Secondary actions must be plain text links, clearly subordinate (smaller, no fill).\n- Place it prominently (sticky bottom bar or near bottom of content).\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\n#4 \u2014 FOR SCREEN-SETS (rubric: multiple_screens, realistic_screens, titled_set)\n- Give the whole set a clear TITLE BANNER at top (e.g. \"Sign-Up Flow \u2014 Acme App \u00b7 3 Steps\") and a caption above each frame (\"Step 1 of 3 \u00b7 Create Account\").\n- Lay frames LEFT-TO-RIGHT as peer tiles, connected by labeled arrows showing transition direction.\n- Each frame must be DISTINCT and individually realistic \u2014 different content/state per step (each obeying the mobile budget above), not near-duplicates. 
Show state progression (e.g. progress dots \u25cf\u25cb\u25cb \u2192 \u25cf\u25cf\u25cb \u2192 \u25cf\u25cf\u25cf, success-green CTA on a final/done screen).\n- Keep shared chrome consistent across frames (same phone outline, status bar, header style, input style, accent color).\n\n\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\u2550\nOUTPUT FORMAT\n- Start with \"# BOARD PLAN: <title>\".\n- State orientation (single frame top-to-bottom, or set left-to-right) and global styling up front.\n- Number the sections of each frame top\u2192bottom.\n- End with a short hierarchy/consistency note.\n- Confirm the content fits one screen with no clipping (\"one screen, no overflow\").\n\nREMEMBER: A sparse, clean frame that clearly reads as a real app scores far higher than a dense, accurate-but-clipped one. When in doubt, cut content.", + "comparison_grid": "COMPARISON LAYOUT: equal-width option cards side by side with IDENTICAL aligned rows; add a delta column (arrow + sign + value + %) where it applies; flag the best/recommended option with a badge (tone grey\u2192green). For products, show an image and an outbound link per option. Limit to the 3-4 most relevant options and keep each card compact (small image, ~4-5 spec rows) so the row fits on one screen without horizontal overflow.", + "graph_process_spine": "GRAPH LAYOUT (process flow): keep the happy path on one straight top-to-bottom spine; send branch/exception steps to the side with edges routed AROUND \u2014 never through \u2014 unrelated nodes. ~5-10 nodes. 
Label every edge with its trigger/condition and give it a visible arrowhead; color nodes by role via type:\"change\"+status and add a legend.", + "graph_zoned_tiers": "GRAPH LAYOUT (zoned architecture): group nodes into labeled zones/tiers via groups[]; show the external entry point and the request/data flow as directed edges between components; distinguish managed services from compute; add a legend. Keep zones from overlapping and route edges around unrelated nodes.", + "chart_internal": "CHART: emit a COMPLETE inline Vega-Lite spec (never a description). Title + subtitle; axis titles WITH units; set width to \"container\". Sort categories explicitly (usually descending). Put value labels on marks and a legend when there are multiple series; make the key insight obvious.", + "artifact_note": "Emit every required item as its OWN labeled, discrete element (own header / structured field / inline spec), never as prose \u2014 the grader scores the rendered image.", + "tail_state-machine": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'state-machine'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each distinct state is a clearly labeled node\n- Every transition edge carries a label naming the triggering event/action\n- Every transition edge has a visible arrowhead showing direction\n- Terminal/final state(s) are visually distinguished (e.g. green/done tone)\n- An initial/entry state is identifiable as the starting point\n- The diagram is laid out top-to-bottom\n- No transition edge passes through an unrelated state node\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_build-pipeline": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'build-pipeline'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each pipeline stage (build, test, scan, deploy) is a node\n- Stages are colored by status: passed=green, running=amber, failed=red\n- Stages connect in execution order with arrowheads\n- Stages show a duration or step detail in a meta/sub line\n- If a stage fails, it is clearly red and the downstream stages reflect being blocked/skipped\n- A legend maps status colors to passed/running/failed\n- No stage nodes overlap and edges do not pass through unrelated nodes\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_user-flow": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'user-flow'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each node represents a screen/state of the app\n- Edges are labeled with the user action/trigger causing the transition\n- A clear entry/start screen is identifiable\n- Decision points (e.g. logged-in vs not, success vs error) branch\n- A primary happy-path is distinguishable\n- End/goal states are marked\n- Laid out top-to-bottom\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_er-diagram": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'er-diagram'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each table/entity is a titled box listing its fields as rows\n- Fields show their data types and key fields (PK/FK) are marked\n- Edges between entities are labeled with cardinality (e.g. 1..*, places, has)\n- Relationship edges have visible arrowheads/markers\n- The schema is laid out top-to-bottom\n- Entity boxes do not overlap and edges do not cross through unrelated entity boxes\n- Every entity participates in at least one labeled relationship (no orphan tables)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_cloud-architecture": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'cloud-architecture'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Resources are grouped into labeled zones (VPC, public/private subnets, tiers)\n- Each resource is a labeled node showing its service/type\n- Directed edges show the request/traffic flow between components\n- The internet/user entry point and the cloud boundary are shown\n- Managed services (DB, queue, cache) are distinguished from compute\n- A legend or tones explain the zone/resource types\n- Nodes and zones do not overlap; edges avoid unrelated nodes\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_recipe": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'recipe'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A clear ingredients list with quantities\n- Numbered preparation steps in order\n- Prep/cook time and number of servings are shown\n- Difficulty or skill level is indicated\n- Nutrition info or chef tips are included\n- A visual/photo of the dish is included\n- Title names the dish\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_shopping-comparison": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'shopping-comparison'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each shopping option is its own card in a comparison grid\n- Each option shows a prominent product image\n- Each option shows its price\n- The cheapest (or best-value) option is flagged with a badge\n- Each option lists key specs/features for comparison\n- Each option has an outbound buy/visit link\n- Each option shows a rating or review score\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_metrics-dashboard": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'metrics-dashboard'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- A row of stat cards each shows a dimmed label and a large value\n- Stat cards carry status badges (color-coded) where relevant\n- An embedded trend chart (VegaLite) is present inside a card\n- The dashboard has a clear title heading\n- The embedded chart fits its card width (not bleeding outside)\n- Content is arranged in a clean top-down stack (title, stats row, chart)\n- All stat values and labels are legible and aligned\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_app-screen-mockup": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'app-screen-mockup'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Looks like a real polished app screen, not a diagram\n- A header / top app bar / nav is present\n- A clear primary content area (cards/list/feed/form) fills the screen\n- A prominent primary call-to-action button/action is present\n- Clear visual hierarchy via headings, sections and spacing\n- Realistic placeholder content (names, values, labels), not lorem stubs\n- Consistent polished styling (colors, spacing, rounded cards)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_training-curves": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'training-curves'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Separate train and validation curves are both plotted\n- X axis is the training step/epoch with a clear axis title\n- Y axis is the loss/metric with a clear axis title\n- The train/val divergence (overfitting gap) is visible and legible\n- The best checkpoint / early-stop point is marked (rule or point)\n- A legend distinguishes the train vs validation series\n- Chart has a descriptive title and subtitle naming the run\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_confusion-matrix": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'confusion-matrix'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- An N\u00d7N grid of actual (rows) vs predicted (columns) classes\n- Each cell shows its count or rate as a text label\n- Cells are color-encoded by value with a sequential scale\n- Both axes are labeled (actual vs predicted) with the class names\n- The correct-prediction diagonal is distinguishable from off-diagonal errors\n- A color legend/scale is present\n- Title names the classifier and dataset\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. 
Be concrete; generic plans fail.", + "tail_observability-dashboard": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'observability-dashboard'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Panes are arranged as an independent-tile grid (a 2x2 mission-control board)\n- An SLO panel with ring progress and a status badge is present\n- A latency chart (e.g. p50/p95/p99) pane is present\n- A top-errors table and/or alert pane is present\n- A throughput chart pane is present\n- The grid mixes component panels and chart panels\n- Each chart pane has a title and axis titles\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "tail_travel-itinerary": "\u2014 NOW PLAN THIS BOARD \u2014\nBoard: 'travel-itinerary'\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nIt is judged against a strict rubric \u2014 satisfy EVERY criterion:\n- Each day of the trip is its own clearly labeled section/pane\n- Activities have times/order within each day\n- A map pane shows the destinations or route\n- A budget/cost element (chart or table) is included\n- Transport and accommodation logistics are noted\n- Must-see highlights are flagged\n- Title names the trip (destination + duration)\n\nApply the layout rules above. Output a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/topo_xtype/report.md b/scripts/experiments/gepa-flowchart/overnight/topo_xtype/report.md new file mode 100644 index 0000000..1d2ea4c --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/topo_xtype/report.md @@ -0,0 +1,7 @@ +# Joint topology-skill run + +journeys: state-machine, build-pipeline, user-flow, er-diagram, cloud-architecture, recipe, shopping-comparison, metrics-dashboard, app-screen-mockup, training-curves, confusion-matrix, observability-dashboard, travel-itinerary +shared skills: artifact_note, board_layout, chart_internal, comparison_grid, dashboard_grid, graph_entity_lanes, graph_process_spine, graph_zoned_tiers, report_rows, screen_frame +shared skills CHANGED by GEPA: board_layout + +**Seed 0.5080 -> Best 0.5875 (+0.0795)** diff --git a/scripts/experiments/gepa-flowchart/overnight/type_calltree/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/type_calltree/best_prompts.json new file mode 100644 index 0000000..05b1305 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/type_calltree/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Inputs:\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nGOAL\nProduce a concise plain-text plan for a CALL HIERARCHY / call-tree diagram (top-down flow). The plan tells a diagram-builder exactly what nodes, calls, branch conditions, and outcomes to render so a first-time reader can answer specific factual questions directly from the diagram.\n\nWHAT TO OUTPUT\n1. Diagram type: one line (call hierarchy / decision flow, top-down, single screen).\n2. Entry point(s): name the actual trigger function/event, its file/component, and the concrete trigger condition (e.g., interval, threshold, event).\n3. 
Call hierarchy: for each step list the function/step, the file/component in parentheses when known, each function it calls, the branch conditions, and the concrete outcome/terminal state. If this is a diff, mark each call as added/removed/modified.\n4. Branch conditions: state the actual comparison values and named constants (e.g., quorum = 3, CPU > 80% for 5min, replicas < 20).\n5. Terminal outcomes: name each end state explicitly.\n6. \"Reader Can Answer\" section: list the specific factual questions the diagram lets a reader answer, each with the concrete answer.\n\nBE CONCRETE \u2014 THIS IS THE TOP PRIORITY\nGeneric plans fail. Always name real values, actors, conditions, and outcomes: actual function names, file paths, numeric thresholds, named constants, version strings, status codes, timeouts, and terminal states. Invent plausible, specific, domain-correct values when the input does not supply them.\n\nLEGIBILITY & LAYOUT RULES (the main source of low scores \u2014 follow strictly)\nThe feedback consistently penalized truncated/clipped/crowded text. The diagram renders on ONE screen and long text gets cut with ellipses or clipped at the right/bottom edge. To prevent this:\n- Keep EVERY label and annotation SHORT \u2014 aim for under ~40 characters per line item. No long sentences as annotations.\n- Do NOT attach long right-side annotations or trailing notes to nodes; they get truncated. Put detail in compact form on its own line under the node instead.\n- Move file paths OFF the node labels. List file/component info compactly and only when it adds value; prefer short component names over full paths (e.g., `scale.go` not `autoscaler-svc/scale.go` if space is tight).\n- Limit the main vertical spine to ~6 nodes maximum to avoid \"deep-tree\" lint warnings. 
Flatten or group where possible.\n- Keep branch conditions terse: `reachableCount < 3 \u2192 quorum lost` rather than a full clause.\n- Reserve a small margin; do not place content at the extreme right or bottom edge.\n- Put any tabular/legend material in a compact, narrow form so it does not run off the bottom or right.\n- Prefer fewer, denser-but-short items over many crowded annotations.\n\nSTRATEGY\n- Identify the single real entry trigger and follow the call chain top-down.\n- At each call, capture: callee name, location (short), branch test with real values, and resulting outcome/state.\n- Ensure the set of branches and terminal states covers the factual questions implied by the Purpose.\n- Note whether this is an initial plan or a diff; if a diff, annotate added/removed/modified calls.\n\nOutput concise plain text only.", + "generate": "You are generating a `calltree` diagram as a single JSON object that will be rendered visually (as a tree/call-hierarchy graphic). Your output is fed directly to a renderer, so layout and text-length constraints matter as much as correctness.\n\n## Inputs you will receive\n- `skill_context`: background on the diagramming skill/tooling.\n- `Topic`, `Audience`, `Purpose`: framing for the diagram.\n- `Plan`: a detailed \"Diagram Plan\" describing an entry point, a call/step hierarchy, branches, loops, terminal states, and render notes. Follow this plan faithfully \u2014 it is the source of truth for nodes, labels, file/component names, branch conditions, and any callouts.\n\n## Output format\nOutput ONLY a single JSON object, nothing else. Structure:\n```\n{\n \"roots\": [\n {\n \"label\": \"string (required)\",\n \"file\": \"string (optional \u2014 component/file name)\",\n \"meta\": \"string (optional \u2014 short annotation)\",\n \"change\": \"added | modified | removed (optional)\",\n \"children\": [ ... same node shape ... 
]\n }\n ]\n}\n```\n- `roots` is an array (use multiple roots only when the plan clearly has separate top-level flows, e.g. a main flow plus a parallel watcher or recovery paths).\n- Label every node specifically: name the function/step AND what it does (e.g. `assignPriority() \u2014 Impact \u00d7 Urgency matrix`).\n- Use `file` for the component/file/module that owns the call when the plan provides it.\n- Use `meta` for branch conditions, end-states, callouts, or back-edge notes.\n- Set `change` (\"added\"/\"modified\"/\"removed\") only where the plan indicates something changed.\n\n## CRITICAL: avoid clipping/truncation (the #1 recurring failure)\nThe renderer truncates long text with ellipses, which severely hurts readability and scores. To prevent this:\n- Keep every `label` short \u2014 aim for under ~55 characters. Prefer `funcName() \u2014 brief purpose` and cut filler words.\n- Keep every `meta` annotation very short \u2014 aim for under ~40 characters. Do NOT pack multiple facts, full sentences, or long condition descriptions into one `meta`. Split or shorten.\n- Never put long phrases like full SLA tables, multi-clause conditions, or \"Back-edge: X \u2192 Y\" explanations as a single annotation; abbreviate (e.g. `back-edge \u2192 escalate`, `breach \u2192 +1 tier`).\n- Avoid relying on right-side annotations to carry essential meaning, since those are the first to be cut off. Put essential info in the (also short) label.\n- Do not let bottom rows overflow: keep total node count modest so the whole tree fits on one screen.\n\n## CRITICAL: keep the tree shallow and compact (avoid deep-tree lint warning)\n- Keep depth modest \u2014 prefer a maximum of about 4\u20135 levels. Deeply nested chains trigger a `deep-tree` warning and read poorly.\n- Respect the plan's node-count hint (e.g. \"~9 nodes\", \"~14 nodes\"); collapse or flatten where possible.\n- Represent forward chains (e.g. 
Tier1 \u2192 Tier2 \u2192 Tier3) and back-edges (loops, re-escalation, recovery) without deeply nesting every step; use sibling nodes plus short `meta` notes (e.g. `\u2192 next step`) instead of long parent-child chains where it reduces depth.\n- Collapse parallel watchers or side processes into a separate compact root with a brief annotation rather than threading them through the main tree.\n\n## Content guidance\n- Show branch conditions concisely (e.g. `BRANCH: stock >= qty?`, `YES \u2192 allocate`).\n- Mark terminal/end states clearly and briefly (e.g. `END: PICKING`, `END: ON_HOLD`).\n- Mark special callout points briefly (e.g. `*** COMMIT POINT ***`).\n- When the plan asks for distinct styling/colors, note it compactly in `meta` (e.g. `END (green)`).\n- Use short arrow references (`\u2192 Coordinator.complete()`) for shared callees instead of duplicating whole subtrees.\n\nOutput ONLY the JSON object." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/type_calltree/report.md b/scripts/experiments/gepa-flowchart/overnight/type_calltree/report.md new file mode 100644 index 0000000..2179dd7 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/type_calltree/report.md @@ -0,0 +1,3 @@ +# GEPA calltree + +**Seed 0.7029 → Best 0.7553 (+0.0525)** diff --git a/scripts/experiments/gepa-flowchart/overnight/type_component/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/type_component/best_prompts.json new file mode 100644 index 0000000..c41c18b --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/type_component/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are planning a single-screen dashboard or report board. 
You will receive:\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nGOAL\nProduce a concise plain-text plan describing exactly what the board renders, so a first-time reader can answer specific factual questions (named values, actors, conditions, outcomes) directly from it. Generic plans fail \u2014 always write the actual concrete values, not descriptions of what values would go there.\n\nCRITICAL RENDERING CONSTRAINTS (these caused past failures \u2014 obey them):\n1. DO NOT use \"Stat\" / \"stat card\" components. They render as broken \"[unknown component: Stat]\" placeholders. Put quick-glance key/value summaries in a small TABLE instead (e.g., a 2-column \"Metric | Value\" table, or a single-row table of headline numbers).\n2. EVERY section must contain real, renderable content INLINE in the plan \u2014 never an empty container, a label like \"(the core list)\", or a section that just names what it would hold. If a section would be a list of steps, write all the steps out fully. Empty/collapsed boxes are the most common failure; eliminate them.\n3. PREFER TABLES for almost everything (key/value summaries, rule mappings, ordered steps, actor\u00d7condition\u2192outcome matrices). Tables render reliably and densely. Use short badge rows only for a handful (3\u20135) of terminal states/outcomes, and keep them brief.\n4. FIT ON ONE SCREEN WITHOUT CLIPPING. The board must not extend off the canvas. Cap the total at 4\u20135 sections. Do not place a badge/outcome row as the final section if it risks being cut off \u2014 if you must include outcomes, fold them into a table column or place them earlier. 
Keep each table to ~5 rows and ~5 columns max.\n\nLAYOUT DECISION\n- Report (process/decision narrative, ordered reading) \u2192 top-down Stack.\n- Dashboard (peer status tiles, no strict order) \u2192 SimpleGrid of peer tiles.\nState which you chose and why in one line at the top.\n\nWHAT TO DECIDE AND OUTPUT\n- Sections (4\u20135 max), each with a clear heading.\n- For each section: its component type (almost always a table; occasionally a short badge row), and the FULL concrete contents \u2014 every column header and every row value spelled out with real names, numbers, thresholds, timeouts, versions, actors, and outcomes.\n- A one-line reading order.\n- A final line listing 3\u20134 specific factual questions a reader can now answer, each with the exact answer drawn from the board (proves the data is present).\n\nSTYLE\n- Concise plain text with markdown headings and real markdown tables.\n- Use realistic, internally consistent concrete values (e.g., specific node counts, epoch numbers, CPU %, thresholds, version tags, engineer names, runbook IDs).\n- Color/severity hints (red/amber/green) may be noted inline but are secondary; never let styling substitute for actual content.\n\nRemember: a section is only useful if it renders with visible content. When in doubt, fewer sections, each a full table, no Stat cards, nothing at the bottom that could clip.", + "generate": "{skill_context}\n\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single `component` node tree. Use Title/Card/Table/Badge to give clear visual hierarchy (no Stat \u2014 put headline numbers in a Table); fill in the concrete values; keep nesting shallow so it isn't cramped. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/type_component/report.md b/scripts/experiments/gepa-flowchart/overnight/type_component/report.md new file mode 100644 index 0000000..7aa169d --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/type_component/report.md @@ -0,0 +1,3 @@ +# GEPA component + +**Seed 0.4363 → Best 0.4840 (+0.0476)** diff --git a/scripts/experiments/gepa-flowchart/overnight/type_panes/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/type_panes/best_prompts.json new file mode 100644 index 0000000..bbd747c --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/type_panes/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Inputs:\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nPlan what the multi-pane view must show so a first-time reader can answer specific factual questions from it. Be concrete: name the actual values, actors, conditions, and outcomes \u2014 generic plans fail. Keep it within a size that renders cleanly on one screen. Output a concise plain-text plan.\nDecide the layout (rows=report, grid=dashboard, columns=compare), the 2\u20134 panes, each pane's type and what it shows, and how they relate. Keep each pane self-contained.", + "generate": "{skill_context}\n\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single `panes` JSON object with 2\u20134 well-chosen panes, each a valid object of its declared type. Pick the layout that fits the reading order. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/type_panes/report.md b/scripts/experiments/gepa-flowchart/overnight/type_panes/report.md new file mode 100644 index 0000000..5183630 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/type_panes/report.md @@ -0,0 +1,3 @@ +# GEPA panes + +**Seed 0.0000 → Best 0.0000 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/type_panes_fixed/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/type_panes_fixed/best_prompts.json new file mode 100644 index 0000000..bbd747c --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/type_panes_fixed/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Inputs:\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nPlan what the multi-pane view must show so a first-time reader can answer specific factual questions from it. Be concrete: name the actual values, actors, conditions, and outcomes \u2014 generic plans fail. Keep it within a size that renders cleanly on one screen. Output a concise plain-text plan.\nDecide the layout (rows=report, grid=dashboard, columns=compare), the 2\u20134 panes, each pane's type and what it shows, and how they relate. Keep each pane self-contained.", + "generate": "{skill_context}\n\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single `panes` JSON object with 2\u20134 well-chosen panes, each a valid object of its declared type. Pick the layout that fits the reading order. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/type_panes_fixed/report.md b/scripts/experiments/gepa-flowchart/overnight/type_panes_fixed/report.md new file mode 100644 index 0000000..2b42ac8 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/type_panes_fixed/report.md @@ -0,0 +1 @@ +**Seed 0.0000 → Best 0.0000 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/type_vegalite/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/type_vegalite/best_prompts.json new file mode 100644 index 0000000..fdaeaf3 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/type_vegalite/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Inputs:\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nYou are planning ONE chart that will be rendered in VEGA-LITE. Output a plain-text plan only.\n\n==================================================================\nABSOLUTE HARD RULE \u2014 READ FIRST\n==================================================================\nVega-Lite is a QUANTITATIVE/STATISTICAL charting grammar. It CANNOT render\nflowcharts, decision trees, swimlanes, swim lanes, lanes, directed graphs,\nnode-edge diagrams, \"fan-out\" branches, decision diamonds, start/end nodes,\nor connector arrows. Every past attempt that planned any of these scored\n~0.05\u20130.21: it produced only an error message + raw JSON, or clipped\ndisconnected boxes with no arrows.\n\nTherefore:\n- The word \"flowchart\" / \"decision tree\" / \"swim lane\" / \"node\" / \"diagram\"\n MUST NOT appear in your chosen chart type. If your first instinct is a flow\n or process diagram, you MUST reframe it into a real x/y chart below.\n- Even if the Topic is a \"workflow\", \"process\", \"routing\", \"triage\",\n \"decision\", \"branching\", or \"check sequence\", you reframe it into a\n quantitative or categorical COMPARISON. 
There are no exceptions.\n\n==================================================================\nHOW TO REFRAME (process/decision \u2192 real chart)\n==================================================================\nPick the closest pattern:\n- Steps/stages with numeric thresholds (seconds, %, count, ms, MB) \u2192\n HORIZONTAL BAR: y = step name (ordinal), x = numeric threshold (quantitative),\n color = outcome category.\n- Input scenario \u2192 resulting value/version/tier \u2192\n GROUPED/STACKED BAR: x = scenario (nominal), y = resulting numeric value,\n color = outcome type. Every bar text-labeled.\n- A sequence over time/latency \u2192 bar or point: x = time/seconds, y = stage.\n- Per-region / per-case computed amounts (e.g. price, tax, FX, SLA) \u2192\n HORIZONTAL or GROUPED BAR: y/x = case, value = the computed number,\n color = category. (E.g. per-region pricing \u2192 grouped bar of final total per region.)\n- Routing/triage with SLAs \u2192 HORIZONTAL BAR: y = priority/queue, x = SLA minutes,\n color = tier. (E.g. ticket triage \u2192 bar of P1\u2013P4 vs SLA minutes, color by tier.)\n- Pass/reject gates with thresholds \u2192 HORIZONTAL BAR: y = check name,\n x = threshold value, color = pass/reject. 
(E.g. a validation gate \u2192 bar of each check's\n numeric threshold: freshness 60 min, null 0.5%, PSI 0.10, mismatch 1%, p99 8 ms.)\n\nPrefer HORIZONTAL bars whenever category names are long (more room for text).\n\n==================================================================\nALLOWED MARK TYPES ONLY\n==================================================================\nbar, line, point/scatter, area, tick, rect (heatmap), text.\n- Every meaningful data point MUST have an inline text label (add a text layer).\n- point/scatter is allowed ONLY if both axes encode real numeric values; never\n use dots as decoration.\n\n==================================================================\nYOUR PLAN MUST SPECIFY ALL OF THESE (numbered 1\u20139)\n==================================================================\n1. The single factual question the chart answers (one sentence).\n2. Mark type (from allowed list ONLY) and why it fits. Must NOT be a flow/diagram.\n3. X field: name, data type (quantitative/ordinal/nominal/temporal), and explicit\n UNITS (seconds, %, pods, version number, count, minutes, ms, currency).\n4. Y field: name, data type, and explicit UNITS.\n5. X-axis title AND Y-axis title \u2014 STATE THEM EXPLICITLY as quoted strings.\n NEVER omit either, even for a categorical axis. (A \"no-axis-title\" lint warning\n fired on every failed example \u2014 always include both.)\n6. Series/color split: the field, its categories, and the exact color mapping\n (e.g., green=safe/pass, orange=step-down, red=halt/reject, blue=process).\n Keep to \u22645 colors.\n7. Scale: linear or log, and the numeric domain/range (e.g., 0\u2013100%, 0\u201360 s,\n 0\u2013500 min).\n8. Headline (chart title): specific, naming the key numbers/thresholds \u2014 not generic.\n9. 
Representative inline data points: list the ACTUAL rows that will appear, each\n with concrete values AND the short text-label string that renders next to it.\n Provide 5\u201312 labeled rows.\n\n==================================================================\nLEGIBILITY & CLUTTER RULES (these drove the low scores)\n==================================================================\n- Keep total marks to ~6\u201312 so nothing truncates with \"...\" on one screen.\n- Keep every label \u2264 ~20 chars so it fits without overflow; abbreviate long names\n and give the full form in the headline or a caption.\n- Avoid empty space: choose a layout where bars/points fill the plotting area.\n- Put each value as a data label adjacent to its mark \u2014 never on connectors or\n tiny shapes.\n- Use a single clear primary ordering (sort bars by value or by stage) so the\n hierarchy is obvious.\n- Ensure the plan fits within a fixed container width so nothing clips on the\n right edge.\n\n==================================================================\nPRESERVE ALL CONCRETE DOMAIN VALUES\n==================================================================\nKeep every threshold, interval, version number, count, actor name, rate, SLA,\nand outcome from the inputs. Generic plans fail \u2014 name the actual values,\nactors, conditions, and outcomes inside the data rows and headline.\n\n==================================================================\nOUTPUT\n==================================================================\nA concise plain-text plan covering items 1\u20139. No JSON. Keep it to a size that\nrenders cleanly on one screen.", + "generate": "{skill_context}\n\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single Vega-Lite spec. Inline realistic `data.values`; give it title+subtitle, axis titles, tooltip, width:\"container\". Pick the mark/scale that makes the trend or comparison legible without crowding. 
Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/type_vegalite/report.md b/scripts/experiments/gepa-flowchart/overnight/type_vegalite/report.md new file mode 100644 index 0000000..fbe3967 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/type_vegalite/report.md @@ -0,0 +1,3 @@ +# GEPA vegalite + +**Seed 0.3871 → Best 0.4322 (+0.0452)** diff --git a/scripts/experiments/gepa-flowchart/overnight/ux_journey_app-screen-mockup/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/ux_journey_app-screen-mockup/best_prompts.json new file mode 100644 index 0000000..ff77303 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ux_journey_app-screen-mockup/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan an 'app-screen-mockup' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nYou are planning ONE single polished app screen (mobile frame, e.g. 390\u00d7844). The plan is rendered literally, so the MOST COMMON failure is CONTENT OVERFLOW/CLIPPING. Treat screen space as a strict budget \u2014 it is far better to under-fill than to overflow.\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Looks like a real polished app screen, not a diagram\n- A header / top app bar / nav is present\n- A clear primary content area (cards/list/feed/form) fills the screen\n- A prominent primary call-to-action button/action is present\n- Clear visual hierarchy via headings, sections and spacing\n- Realistic placeholder content (names, values, labels), not lorem stubs\n- Consistent polished styling (colors, spacing, rounded cards)\n\nHARD CONSTRAINTS TO AVOID OVERFLOW (these are the #1 cause of low scores):\n- Budget for a single mobile screen. Include at MOST 2\u20133 content cards/sections total in the primary area.\n- Lists/feeds: MAX 3 items. KPI rows: MAX 3 stat cards. 
Tables: MAX 3\u20134 rows.\n- Do NOT stack header + sidebar + multiple rows + bottom bar + pinned CTA all at once. Pick a lean layout that fits.\n- Keep every text string short (labels, not paragraphs). No long body copy.\n- Reserve clear vertical room for the CTA so it is never clipped.\n\nPRIMARY CTA (a frequent zero-score failure \u2014 make it unmissable):\n- Exactly ONE dominant primary action, full-width, filled with the accent color, white bold text, ~52px tall, 12px radius.\n- State its exact label (e.g. \"Place Order\", \"Follow\"), its position (pinned bottom bar OR inline at top of content), and that it visually dominates all other buttons.\n- Any secondary buttons must be clearly subordinate (outline/ghost style).\n\nCONSISTENT STYLING (state these EXPLICITLY and reuse them everywhere):\n- Define ONE accent color (hex), background color (hex), card color (hex), and 2 text colors (hex).\n- Define ONE corner radius (e.g. 12px), ONE padding/gutter value (e.g. 16px), and a 3\u20134 step type scale (e.g. 24/18/15/13).\n- Apply the SAME accent to links, active nav, selected states, and the CTA. Apply the SAME radius/padding to every card. Call this out so consistency is unambiguous.\n\nCOMPREHENSION / CLARITY (keep the plan easy to parse):\n- Use numbered sections mapped to the rubric criteria, in this order: Overall Style \u2192 Header/Nav \u2192 Primary Content Area \u2192 Primary CTA \u2192 Visual Hierarchy \u2192 Realistic Content \u2192 Consistent Styling.\n- Be specific and concrete; generic plans fail. Every label, value, name, price, date, and count must be a realistic literal string (e.g. 
\"Jordan Mitchell\", \"$122.04\", \"Last 30 days\", \"48.6K Followers\") \u2014 never lorem ipsum or placeholder stubs.\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion, while respecting the overflow budget above.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/ux_journey_app-screen-mockup/report.md b/scripts/experiments/gepa-flowchart/overnight/ux_journey_app-screen-mockup/report.md new file mode 100644 index 0000000..ac3590b --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ux_journey_app-screen-mockup/report.md @@ -0,0 +1,5 @@ +# GEPA journey: app-screen-mockup + +rubric criteria: 7; val: 2 + +**Seed 0.5689 → Best 0.6133 (+0.0444)** diff --git a/scripts/experiments/gepa-flowchart/overnight/ux_journey_design-system/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/ux_journey_design-system/best_prompts.json new file mode 100644 index 0000000..d9b655a --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ux_journey_design-system/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are planning a single \"design-system\" board (one fixed-size canvas/artboard). Inputs:\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nCRITICAL CONSTRAINT \u2014 FIT ON ONE BOARD WITHOUT OVERFLOW:\nThe #1 failure mode is content overflowing/clipping the board, which causes whole sections to be cut off and scored as MISSING. A shorter plan where every section fully fits beats a rich plan that overflows. Be ruthlessly concise. 
Do NOT exhaust every variant or token \u2014 show a representative, minimal-but-complete set. When in doubt, cut content.\n\nThe board is judged against a strict rubric. EVERY criterion below must be visibly present AND fit:\n1. Color palette swatches: primary, secondary, semantic (with hex + token name).\n2. Typography scale: headings + body rows (size/weight).\n3. Core components in variants (e.g., button states, badges).\n4. Form elements: inputs, toggles, selects.\n5. Spacing/grid or sizing tokens documented.\n6. Each element/section labeled with name/usage.\n7. Consistent visual brand across the showcase.\n\nHARD LIMITS (to guarantee fit \u2014 do not exceed):\n- Exactly 6 sections, in this fixed order: (1) Brand Header, (2) Color, (3) Typography, (4) Components, (5) Form Elements, (6) Spacing/Grid.\n- Color: max 3 primary, 2 secondary, 4 semantic, 4 neutrals. One line each.\n- Typography: max 5 rows (e.g., H1, H2, H3, Body, Caption). One line each.\n- Components: Buttons with 4 states max (Default/Hover/Disabled/Loading) \u00d7 2\u20133 variants; plus 1 badge row (3\u20134 badges). Nothing more.\n- Form Elements: exactly 4 \u2014 Text input (2 states max), Select, Toggle, Checkbox. No textarea/radio/search/stepper.\n- Spacing/Grid: one spacing scale (5\u20136 steps), one grid spec line, one radius token line. No elevation section.\n\nLAYOUT (state it explicitly so nothing clips):\n- Specify a 2- or 3-column grid that places each of the 6 sections in its own bordered card with uniform padding and gutters.\n- Keep every card's content to a few short lines; assume limited vertical space per card.\n- Name a single brand: one typeface, one primary hex accent, one corner-radius, applied to header + all cards.\n\nOUTPUT FORMAT:\n- Concise plain text. Use short labeled lines, not paragraphs.\n- Give SPECIFIC real values (hex codes, px sizes, weights, token names, sample labels). 
Generic plans fail.\n- Every swatch/row/component/field must carry a visible name/token + brief usage.\n- Prioritize completeness of all 7 criteria over depth in any one. Total plan should be brief enough that all 6 sections clearly fit on one board.", + "generate": "You generate a single valid JSON object describing a UI component tree (Mantine-style schema) that renders a \"Design-System Board\" from a provided plan.\n\n## INPUT FORMAT\nYou receive:\n- skill_context: general framing\n- Use case / Reader / Goal: meta-context (typically a design-system reference board for UX/designers)\n- Plan: a markdown spec with BRAND, LAYOUT, and 6 numbered SECTIONS (Brand Header, Color, Typography, Components, Form Elements, Spacing/Grid). The plan names SPECIFIC values: typeface, hex colors + token names + usage, type scale rows, button variants/states, form element states, spacing scale, grid spec, radius scale.\n\n## OUTPUT\nOutput ONLY one valid JSON object (the component tree). No prose, no markdown fences. Bake in the SPECIFIC literal values from the plan (every hex, token, px, label) \u2014 never use placeholders.\n\n## SCHEMA (component types available)\nNodes are `{\"type\": ..., \"props\": {...}, \"children\": ...}` where children is a string, an object, or an array of nodes.\nKnown types: Stack, Group, SimpleGrid, Card, Paper, Box, Title, Text, Badge, Table, Divider, Anchor, Code, List, List.Item.\n- Title props: `order` (1-3). Text props: `size` (\"xs\"/\"sm\"/\"md\"/\"lg\"/\"xl\"), `c` (color or \"dimmed\"), `fw` (weight).\n- Card/Paper props: `withBorder`, `radius`, `p`. 
Use numeric px or size tokens consistently.\n- SimpleGrid props: `cols`, `spacing`, `verticalSpacing`.\n- Table props: `data: {head: [...], body: [[...], ...]}`.\n- Box can be used as a color swatch via `w`, `h`, `bg`/`backgroundColor`, `style.borderRadius`.\n\n## CRITICAL: THE RUBRIC IS SCORED ON THESE CRITERIA \u2014 SATISFY ALL FULLY\nThe weakest scores in past attempts were form_elements, spacing_or_grid, and component_variants. These FAIL when content is merely described in text instead of being visually rendered as distinct elements. To score full marks:\n\n1. **component_variants** \u2014 Render every button variant \u00d7 state as a SEPARATE visible chip/box with its OWN distinct styling. Do NOT collapse states into generic gray badges. Apply the exact plan hex per state: Default uses the primary hex, Hover uses the hover hex, Disabled uses a muted/translucent style, Loading shows a spinner glyph (\u27f3/\u25cc). Secondary buttons must use outline/variant styling distinct from primary. Render Badges as a separate labeled row with their semantic colors.\n\n2. **form_elements** \u2014 Render each form element as an actual VISUAL mock, not a list item or plain sentence. For EACH of the 4 elements:\n - Text input: render TWO bordered Box mockups \u2014 one Default (1px border in the neutral border hex, e.g. #E2E8F0) and one Focus (1px border in the primary hex, e.g. #2563EB), each containing placeholder text and showing the \"Email\" label above.\n - Select: a bordered Box showing the option text with a \u25be caret.\n - Toggle: render distinct On (pill in primary hex) and Off (pill in neutral/gray hex) visual states with the label.\n - Checkbox: render distinct checked (\u2611/\u2713 in primary hex) and unchecked (\u2610) states with the label.\n Each element needs its own label and visible boxed representation.\n\n3. 
**spacing_or_grid** \u2014 Render the spacing scale as a Table (token | px) AND show visual proportion where possible; also render Grid spec and Radius scale (as a small table or labeled badges). Include every token/px pair from the plan. Missing the table or any value zeroes this criterion.\n\n## CRITICAL: GEOMETRY \u2014 CONTENT MUST NOT OVERFLOW/CLIP\nEvery past attempt clipped content. Prevent this:\n- Do NOT pack many long Badges into a single non-wrapping Group. Long text inside fixed chips overflows. Keep badge/chip text SHORT (e.g. \"Default\", \"#2563EB\", a token name) \u2014 never cram \"hex \u00b7 token \u00b7 usage\" into one badge.\n- For dense data (color tables, type scale, spacing scale), prefer a Table OR a vertical Stack of compact rows (small swatch Box + short Text) rather than wide horizontal Groups of large badges.\n- Keep the layout within a 3-column SimpleGrid; account for limited card width. Use compact sizes (`size: \"sm\"`/\"xs\"), tight gaps, and let rows stack vertically inside cards.\n- The Color section is the densest \u2014 render it as a vertical Stack of rows, each row = small swatch Box (w/h ~20) + short Text (\"#HEX \u00b7 token \u00b7 use\"). Do not use giant badges that exceed card width.\n\n## LAYOUT STRATEGY (follow the plan's LAYOUT)\n- Top: full-width Brand Header (Paper/Card) \u2014 title, subtitle, and a brand swatch/version chip. Often spans all 3 columns as a separate node above the grid.\n- Below: a SimpleGrid with cols=3 containing the 6 cards (Color, Typography, Components, Form Elements, Spacing/Grid, and a 6th card such as a Legend/summary if the plan implies one).\n- Apply BRAND globally: every card withBorder=true, border 1px in the neutral border hex, radius matching the plan (e.g. 
8px), uniform 16px padding, 16px gutters, and the named typeface in style.fontFamily.\n- Give each card a Title (order 2 or 3) matching its section name.\n\n## CHECKLIST BEFORE OUTPUT\n- All 6 sections present, each in its own bordered card.\n- Every literal value from the plan baked in (all hexes, tokens, px, labels, states).\n- Form elements rendered as visual mocks (default+focus input, select, toggle on/off, checkbox checked/unchecked).\n- Button variants\u00d7states each visually distinct with correct per-state colors; badges row present.\n- Spacing table + grid + radius all present with exact values.\n- No overcrowded Groups; short chip text; dense data in tables/stacks to avoid clipping.\n- Valid JSON, single object, output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/ux_journey_design-system/report.md b/scripts/experiments/gepa-flowchart/overnight/ux_journey_design-system/report.md new file mode 100644 index 0000000..b1d4edd --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ux_journey_design-system/report.md @@ -0,0 +1,5 @@ +# GEPA journey: design-system + +rubric criteria: 7; val: 2 + +**Seed 0.0981 → Best 0.4358 (+0.3377)** diff --git a/scripts/experiments/gepa-flowchart/overnight/ux_journey_screen-set/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/ux_journey_screen-set/best_prompts.json new file mode 100644 index 0000000..4c89d25 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ux_journey_screen-set/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'screen-set' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Several distinct app screens are shown side by side\n- The screens tell a sequence/flow (step 1 \u2192 2 \u2192 3)\n- Each screen/frame has a title/caption\n- Each frame looks like a real screen, not a diagram\n- Consistent 
visual style across the frames\n- The flow/transition between frames is indicated\n- The set has an overall title naming the flow\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/ux_journey_screen-set/report.md b/scripts/experiments/gepa-flowchart/overnight/ux_journey_screen-set/report.md new file mode 100644 index 0000000..2188772 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ux_journey_screen-set/report.md @@ -0,0 +1,5 @@ +# GEPA journey: screen-set + +rubric criteria: 7; val: 2 + +**Seed 0.3788 → Best 0.3788 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/ux_journey_sitemap/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/ux_journey_sitemap/best_prompts.json new file mode 100644 index 0000000..243a87d --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ux_journey_sitemap/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'sitemap' board rendered as a visual tree on a FIXED finite canvas.\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nOUTPUT: a concise plain-text plan naming SPECIFIC content (real labels, real structure) with concrete on-canvas (x,y) placement for every node. Generic plans fail.\n\n================= CRITICAL GEOMETRY RULES (most past failures were off-canvas) =================\nCanvas is 1280 wide \u00d7 900 tall, origin top-left. Node \u2248160w \u00d7 50h. 
Every node must fit fully inside with \u226520px margin (so node CENTER x\u2208[100,1180], y\u2208[60,815]).\n\nTHE #1 PAST MISTAKE \u2014 DO NOT REPEAT IT:\nPast plans placed multiple deeper-level children at the SAME y AND shifted them HORIZONTALLY around the parent (e.g. parent at x=240 \u2192 children at x=160 and x=320, or x=760/x=920). This pushed nodes into neighboring lanes or off-canvas and scored 0.15 with 7\u201311 off-canvas nodes. NEVER split children horizontally. NEVER use x values like 160, 320, 760, 920.\n\nTHE FIX THAT WORKED (scored 0.61, geometry clean):\n- Use ONLY these 5 fixed column x-positions: 240, 440, 640, 840, 1040. Every node sits on exactly one of these x-values. No exceptions.\n- Each primary-nav node owns one column (lane). ALL of its descendants stay in that SAME column (same x).\n- When a parent has 2+ children, do NOT spread them horizontally \u2014 STACK them VERTICALLY in the column at staggered y-values (90\u2013110px apart). Example: parent at (1040,400) \u2192 child A (1040,510) \u2192 child B can stack under A.\n- Keep lanes physically separate: never put two different branches' nodes in the same column at overlapping y unless they belong to the same lane.\n\nY-LEVEL GUIDE (use these as anchors; intermediate stacking y-values are allowed):\n- L0 root: y=60\n- L1 primary nav: y=220\n- L2 sub-pages: y=400 (stack siblings at 400, 510 if needed)\n- L3 deep pages: y=600 (stack at 600, 710 if needed)\n- Never place a node center below y=815.\n\nNODE BUDGET: 14\u201318 nodes MAXIMUM (15 is ideal). Fewer well-placed nodes score far higher than a sprawling tree. Do NOT enumerate every sub-page. Show depth (L3) on only 1\u20132 branches; keep the rest shallow.\n\nSTRUCTURE: 1 root + 4\u20135 primary nav nodes (one per column) + a SELECTIVE set of sub-pages. Max depth 4 levels (L0\u2013L3).\n\n================= LAYOUT (drives comprehension) =================\n- Strict top-to-bottom tree. 
Root centered at top (640, 60).\n- Primary nav nodes in a single horizontal row at y=220, each at one of x=240/440/640/840/1040.\n- Sub-pages stack DIRECTLY below their parent in the SAME column (same x), at increasing y.\n- Orthogonal right-angle connectors only; one edge per parent\u2192child link.\n- Because every descendant shares its lane's x, edges run vertically and never cross unrelated nodes.\n\n================= DOMAIN GUIDANCE BY USE CASE =================\n- Mobile app: root = \"Launch \u2192 Home Tab Bar\"; L1 = bottom-nav tabs (Home, Explore, Cart, Profile); L2 = key screens per tab; deepen 1\u20132 tabs to L3 (e.g., Cart \u2192 Cart Review \u2192 Checkout \u2192 Payment Method; Profile \u2192 Account Overview \u2192 Edit Profile \u2192 Settings). Stack the deep chain vertically in the same column.\n- E-commerce site: root = \"Home (/)\"; L1 = Shop, Deals, Account, Cart & Checkout, Support (show URL paths). Deepen ONLY Shop to L3 (Categories \u2192 one product category \u2192 Product Detail). Do NOT expand all five. Stack Shop's deep nodes vertically in the x=240 column.\n- SaaS app: root = \"Dashboard (Home)\"; L1 = Projects, Reports, Team, Billing, Settings; L2 = one sub-page each; deepen 1\u20132 (e.g., Project List \u2192 Project Detail \u2192 Activity Log) stacked vertically in that column.\n\n================= RUBRIC (satisfy EVERY criterion) =================\n1. Home/root is the single top node.\n2. Pages nest under their parent.\n3. Navigation groups/sections clearly labeled.\n4. Multiple depth levels shown (primary nav \u2192 sub-pages \u2192 at least one deeper page = 4 levels).\n5. Each page clearly labeled with a specific real name.\n6. Laid out top-to-bottom as a tree (y increases by level).\n7. No edge passes through an unrelated node (enforced via single-column-per-lane stacking).\n\n================= REQUIRED OUTPUT STRUCTURE =================\n1. 
One-line Layout Strategy (compact tree, node count, \"all on column x\u2208{240,440,640,840,1040}, all y\u2208[60,815]\" fit statement).\n2. Node list by level, each: label + (x, y). \u226418 nodes. Confirm every x is one of 240/440/640/840/1040.\n3. Edge list: parent \u2192 child pairs, confirming each child shares its parent's column (same x).\n4. Brief Rubric Compliance check (one line per criterion).\n\nSELF-CHECK before finishing:\n- Is every x one of 240, 440, 640, 840, 1040? (If not, fix it.)\n- Is every y between 60 and 815? (If not, fix it.)\n- Do all descendants of a primary-nav node share that node's x? (If not, fix it.)\n- Are there \u226418 nodes? (If not, cut.)\n\nKeep it tight. Priorities: (a) all nodes on-canvas via the 5 fixed columns + vertical stacking, (b) readable compact tree, (c) full rubric coverage.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/ux_journey_sitemap/report.md b/scripts/experiments/gepa-flowchart/overnight/ux_journey_sitemap/report.md new file mode 100644 index 0000000..0445018 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ux_journey_sitemap/report.md @@ -0,0 +1,5 @@ +# GEPA journey: sitemap + +rubric criteria: 7; val: 2 + +**Seed 0.1546 → Best 0.5583 (+0.4037)** diff --git a/scripts/experiments/gepa-flowchart/overnight/ux_journey_user-flow/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/ux_journey_user-flow/best_prompts.json new file mode 100644 index 0000000..2b7dfd7 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ux_journey_user-flow/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'user-flow' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Each node represents a screen/state of the app\n- Edges are labeled with the user action/trigger causing the transition\n- A clear entry/start screen is identifiable\n- Decision points (e.g. logged-in vs not, success vs error) branch\n- A primary happy-path is distinguishable\n- End/goal states are marked\n- Laid out top-to-bottom\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/ux_journey_user-flow/report.md b/scripts/experiments/gepa-flowchart/overnight/ux_journey_user-flow/report.md new file mode 100644 index 0000000..d769303 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ux_journey_user-flow/report.md @@ -0,0 +1,5 @@ +# GEPA journey: user-flow + +rubric criteria: 7; val: 2 + +**Seed 0.2527 → Best 0.2527 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/ux_journey_wireframe/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/ux_journey_wireframe/best_prompts.json new file mode 100644 index 0000000..3f48bce --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ux_journey_wireframe/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "Plan a 'wireframe' board.\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nThis is judged against a strict rubric \u2014 a great board MUST satisfy EVERY criterion:\n- Low-fidelity look: greyscale boxes/placeholders, minimal color or imagery\n- Layout regions are blocked out (header, sidebar, content, footer)\n- Placeholder text/image blocks stand in for real content\n- Annotations/notes explain regions or interactions\n- Elements align to a clear grid/structure\n- Key UI elements (nav, buttons, inputs, lists) are indicated as boxes\n- Titled with the screen/page name\n\nOutput a concise plain-text plan naming the SPECIFIC content (real values, labels, structure) that satisfies each criterion. Be concrete; generic plans fail.", + "generate": "{skill_context}\n\nUse case: {topic}\nReader: {audience}\nGoal: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single valid JSON object of the declared type that satisfies EVERY rubric criterion. Bake in the SPECIFIC values the plan names \u2014 no placeholders. Output ONLY the JSON." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/ux_journey_wireframe/report.md b/scripts/experiments/gepa-flowchart/overnight/ux_journey_wireframe/report.md new file mode 100644 index 0000000..0aacaae --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/ux_journey_wireframe/report.md @@ -0,0 +1,5 @@ +# GEPA journey: wireframe + +rubric criteria: 7; val: 2 + +**Seed 0.4181 → Best 0.4181 (+0.0000)** diff --git a/scripts/experiments/gepa-flowchart/overnight/whm_hybrid_opus/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/whm_hybrid_opus/best_prompts.json new file mode 100644 index 0000000..bcb32b4 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/whm_hybrid_opus/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are planning a flowchart that will be fed to a downstream diagram generator (a React Flow-style auto-layout engine). Your plan's quality is judged on TWO things, BOTH of which must score well:\n(1) COMPREHENSION \u2014 a newcomer can answer specific factual questions from the rendered diagram, and\n(2) VISUAL RENDERING \u2014 the diagram fits on one screen with no clipped/off-canvas nodes and no edges that cross unrelated nodes.\n\nCRITICAL LESSON FROM PAST FAILURES: Plans with perfect content still scored ~0.4 overall because the rendered board pushed 18\u201326 nodes off-canvas, clipped everything after the first pane, and produced edges that ran across unrelated nodes. The content was right but UNREADABLE. You must treat rendering geometry as a first-class constraint, not an afterthought. A smaller plan that fully renders beats a complete plan that clips.\n\nInputs:\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nProduce a concise plain-text plan describing what the flowchart must show. 
Follow ALL rules below.\n\n============================================================\n=== HARD RENDERING CONSTRAINTS (these caused past failures) ===\n============================================================\n\n1. TOTAL NODE BUDGET ACROSS THE WHOLE PLAN: 12\u201318 nodes MAXIMUM, summed over ALL panes. The generator lays out every node you describe; past plans assumed each pane rendered separately and blew the budget (3 panes \u00d7 8\u20139 nodes = 24+ nodes \u2192 26 off-canvas). Count EVERY box you mention \u2014 including every decision, every terminal state, and every intermediate step \u2014 and keep the grand total at or below 18.\n\n2. PER-PANE BUDGET: each pane is 4\u20137 nodes. If you genuinely cannot fit the topic in 18 total nodes, KEEP ONLY the 2\u20133 panes that best serve the stated Purpose and explicitly drop the rest into \"Out of Scope.\" Do NOT silently include a fourth pane that will be clipped. It is better to fully cover 2 panes than to truncate 3.\n\n3. PREFER FEWER PANES. Two well-rendered panes outscore three clipped ones. Only use 3 panes if the total still fits in 18 nodes (i.e., ~6 nodes each). Never plan 4 panes.\n\n4. NO EDGE-OVER-NODE GEOMETRY. The single most common error was a decision node branching to its \"happy path\" successor while a failure/side node sat physically between them (e.g., `D1 \u2192 main_step` rendered ON TOP OF \"side_node\"). To prevent this:\n - For any decision with a \"main/continue\" branch and a \"side/failure\" branch, route the MAIN branch to the IMMEDIATELY-next node, and send the SIDE branch to a TERMINAL node placed off to the side (a leaf with no further outgoing edges). Failure/retry/escalation branches should END in a clearly-labeled terminal state, NOT route back into the main column past other nodes.\n - Do NOT create edges that skip over an intermediate node to reach a later one. 
Every edge should connect adjacent nodes in reading order or go to a leaf terminal.\n - Do NOT create edges that jump from one pane into the middle of another pane. Panes connect only via a single clean hand-off from the last node of one pane to the first node of the next (or, better, treat panes as fully independent \u2014 see #5).\n\n5. TREAT PANES AS INDEPENDENT, SELF-CONTAINED DIAGRAMS. Each pane should have its own Entry/Trigger and its own Terminal States. Inter-pane references should be a label (\"\u2192 continues to Pane B\"), NOT a drawn edge spanning panes. This keeps each pane laying out cleanly.\n\n6. NO LONG-RANGE OR BACK-JUMPING EDGES. Loop-backs/retries should connect to an adjacent node or resolve into a terminal \"retry exhausted \u2192 escalate\" leaf \u2014 never an arrow spanning the whole chart.\n\n7. KEEP TERMINAL STATES FEW (2\u20134 per pane) and place each as a leaf.\n\n8. Suggest a simple layout direction (usually top-to-bottom) OR let the engine choose. Do not over-specify layout.\n\n============================================================\n=== CONTENT: BE CONCRETE, NOT GENERIC ===\n============================================================\nWithin the tight node budget, every node you DO include must carry the specific detail a reader will ask about. When choosing what to keep, prioritize the nodes/details that answer the Purpose's implied factual questions (the reader will be quizzed on triggers, ordering, actors, mappings, and failure paths). For each step/decision/trigger, NAME the conventional value rather than describing it abstractly. Always specify:\n\n- TRIGGERS WITH VALUES: concrete thresholds, durations, counts, conditions (e.g., \"CPU > 80% sustained 5 min\", \"3 missed heartbeats / 15s timeout\", \"majority = N/2+1 nodes\"). 
State a representative real-world default; never say \"a threshold\" or \"a grace window.\"\n- EVALUATING ACTOR: for each decision, name WHO/WHAT evaluates it (the node, a leader/coordinator, a quorum service, CI, on-call engineer, incident commander). State explicitly automated vs. human-in-the-loop approval.\n- STATE/DATA DETAILS: who creates/increments/persists/validates it, where it's stored, at which step it's checked or causes rejection.\n- FAILURE PATHS: for every fallible action or external dependency, specify behavior \u2014 block, retry (how many / what backoff), escalate (to whom / after what timeout), or fail-safe/rollback. Each failure branch must terminate in an explicit leaf node; never leave it implied. (But remember: each such terminal counts against the node budget \u2014 keep them tight.)\n- ORDERING: state the explicit sequence of major operations, AND mark which steps are automated vs. manual where the audience would ask.\n- MAPPING TABLES / LEGENDS: when the topic has a fixed mapping (commit prefix \u2192 version bump, severity tiers, category \u2192 queue, impact\u00d7urgency \u2192 priority), list the FULL mapping inline IN THE LEGEND. Mapping tables live in the text legend, not as separate nodes, so they don't consume the node budget but are still answerable.\n- NEWCOMER CONTEXT: a short legend defining domain terms the audience-newcomer needs, with concrete values.\n\nPush exhaustive mappings, rate tables, SLA tables, and definitions into the LEGEND text (section 2). Reserve actual flowchart NODES for the flow logic only. This is how you stay within the node budget while keeping comprehension high.\n\n============================================================\n=== OUTPUT FORMAT (plain text, labeled sections) ===\n============================================================\n1. Purpose & Scope (1\u20132 sentences).\n2. 
Context for Newcomers \u2014 legend: terms + ALL fixed mappings/tables with concrete values (this carries comprehension detail without using nodes).\n3. Panes \u2014 name each pane, what it covers, and its node count. Then state the GRAND TOTAL node count and confirm it is \u2264 18.\n4. For each pane: \n - Entry/Trigger (concrete condition),\n - Key Steps/States (ordered, concrete details + responsible actor, automated vs manual),\n - Decisions & Branch Triggers (each: specific threshold/condition; evaluating actor; every branch's destination; main branch \u2192 next node, failure/timeout/retry/escalation branch \u2192 a leaf terminal),\n - Terminal States (leaves, 2\u20134).\n - Explicitly confirm no edge skips over an intermediate node and no edge crosses into another pane.\n5. Out of Scope \u2014 state explicitly what is excluded, INCLUDING any panes/details you dropped to stay within the node budget.\n\nBefore finalizing: re-count every node across all panes. If the total exceeds 18, cut content (drop a pane to Out of Scope or merge steps) until it fits. Verify every failure branch ends in a leaf and no edge runs past an intermediate node. Prioritize concrete values, named actors, and explicit failure handling over volume \u2014 but never at the cost of fitting on one screen.", + "generate": "You generate a flowchart `flow` diagram as JSON that renders cleanly in a React Flow canvas and is readable as a SINGLE screenshot without panning or zooming.\n\n{skill_context}\n\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single flow JSON object. Output ONLY the JSON (no prose, no code fences).\n\n=================================================================\nTOP PRIORITY: THE IMAGE MUST FIT ON SCREEN\nPast diagrams were logically perfect but FAILED visually. 
The recurring, score-killing problems were:\n - Nodes pushed OFF the visible canvas (most common \u2014 happened in nearly every example).\n - Node-pair OVERLAPS (boxes rendered on top of each other).\n - Boxes CLIPPED at the right/bottom edges, cutting off label text.\n - Overly long labels forced into tall, narrow, multi-line boxes that overflow and crowd.\n - Three panes spread the graph too wide so the rightmost pane was cut off.\n\nThese are caused by (a) too many nodes, (b) labels that are too long, and (c) layouts that grow too wide/tall. The rules below exist to prevent exactly these failures. Treat compactness as more important than completeness of detail in any single label.\n\n=================================================================\nOUTPUT SCHEMA\n{\n \"direction\": \"TB\" | \"LR\",\n \"nodes\": [\n { \"id\": \"<unique-id>\", \"data\": { \"label\": \"<string>\", \"status\": \"<status>\" }, \"group\": \"<groupId>\" }\n ],\n \"edges\": [\n { \"source\": \"<id>\", \"target\": \"<id>\", \"data\": { \"label\": \"<condition/trigger>\" } }\n ],\n \"groups\": [\n { \"id\": \"<groupId>\", \"label\": \"<pane title>\", \"color\": \"<hex>\" }\n ]\n}\n- `status` values: \"info\" (entry/trigger points), \"active\" (process/action steps), \"neutral\" (decision/evaluation nodes or neutral terminals), \"warn\" (failure/error/escalation/abort), \"success\" (successful terminals).\n- Edge `data.label` is optional; use it ONLY for conditions/triggers/branch outcomes. Unconditional sequential edges need no label.\n- Every node must belong to a group. Every group needs a distinct color.\n\n=================================================================\nHARD SIZE LIMITS (do not exceed \u2014 these prevent off-canvas/overlap failures)\n\nL1. TOTAL NODES \u2264 15. Aim for 12\u201314. If the plan lists more, consolidate steps and drop only out-of-scope detail \u2014 never silently drop in-scope branches.\n\nL2. AT MOST 2 PANES. 
Two panes render within one screen; three panes consistently pushed content off-canvas in past renders. Only use a 3rd pane if the plan explicitly defines three AND the total stays \u2264 15 nodes with very short labels. If you would need a 3rd pane only to hold shared failure/abort terminals, instead keep each abort terminal as a leaf inside the pane that triggers it (see L4) rather than adding a pane.\n\nL3. LABEL LENGTH: keep each node label \u2264 ~90 characters and ideally on 1\u20132 lines. Long labels become tall narrow boxes that overflow. Pack the concrete facts (see CONTENT RULES) tersely; drop filler words, articles, and restated context. Prefer \"OCSP/CRL revoked? (hard-fail)\" over a full sentence. Use abbreviations the audience knows.\n\nL4. WIDTH CONTROL. A pane that fans out into many side-leaves grows too wide and clips. To stay narrow:\n - Keep the main flow a single vertical (TB) spine.\n - Each decision should branch to AT MOST one side-leaf (e.g., a failure terminal) plus the main next node. Place the side-leaf immediately beside its decision.\n - Do NOT create one shared distant \"sink\" node that many decisions point to from across the graph \u2014 those long edges cross unrelated nodes. Give each failing decision its OWN adjacent terminal leaf instead.\n\n=================================================================\nLAYOUT RULES (drive the geometry/visual score)\n\n1. SPLIT INTO PANES (groups) per the plan, but obey L2 (\u2264 2 panes). Each pane = one group. ~6\u20139 nodes per pane.\n\n2. NO EDGE-OVER-NODE CROSSINGS \u2014 a top failure. An edge from a node to a NON-ADJACENT node visually cuts across nodes between them.\n - Order nodes within a pane so edges connect ADJACENT nodes. Lay the main path out linearly in the flow direction.\n - Branch edges (decision \u2192 failure terminal) must go to an ADJACENT leaf. 
Never route a branch edge over 2+ intermediate nodes.\n - Loop-back edges (e.g., retry \u2192 earlier step) are allowed ONLY when source and target are adjacent/near-adjacent. If a back-edge would jump over 2+ nodes, restructure.\n - Concretely AVOID: `decisionA\u2192sink`, `decisionB\u2192sink`, `decisionC\u2192sink` where `sink` sits far away. Use per-decision adjacent terminals instead.\n\n3. DIRECTION. Default \"TB\" for mostly-linear flows. Use \"LR\" only if a pane is short and has many small parallel side-branches. Minimize crossings and overflow.\n\n4. CROSS-PANE HANDOFF: do NOT draw edges between panes. Reference the handoff in a label (e.g., \"\u2192 continues in Pane B\"). Pane B has its own entry node.\n\n=================================================================\nCONTENT / LABEL RULES (drive comprehension) \u2014 keep them TERSE per L3\n\n5. LABELS MUST BE SPECIFIC AND SELF-CONTAINED, but compact. Bake concrete facts from the plan directly into labels:\n - Concrete numbers: thresholds, timeouts, retries, backoffs, TTLs, %s, SLAs, quorums (e.g., \"timeout 150ms = 3 missed beats\", \"retry: nextIndex\u22121\", \"quorum N/2+1\", \"SLA 10 biz days, reminders d5/d8\", \"skew \u00b15min\").\n - WHO performs each step (actor/component): e.g., \"Leader evaluates\", \"Client TLS lib\", \"Compliance Officer\", \"Proc. Officer\". Readers consistently ask which actor owns a step.\n - For gates/checklists, ENUMERATE the actual criteria, abbreviated (e.g., \"trusted root + dates valid + not revoked + SAN match\", \"score\u226550, COI\u2265$1M, no sanctions match\"). Naming the gate alone is insufficient.\n - Comparison method / sample size when the plan gives it.\n - Failure/edge-case paths must appear as explicit NODES/EDGES, not be implied: timeouts, retries-exhausted, dependency unreachable, write failure, rollback/abort, escalation when SLA exceeded, stale-leader step-down, etc. (Trim these to short labels but keep them present.)\n\n6. 
DECISION NODES: phrase as a question; label EACH outgoing edge with the branch outcome (e.g., \"ACK\", \"REJECT\", \"out of range\", \"all docs \u226410d\", \"no common suite\"). Decision edge labels were noted as missing/faint in feedback \u2014 always include them.\n\n7. TERMINAL STATES: explicit nodes with proper status (\"success\" good ends, \"warn\" error ends, \"neutral\" held/archived). Prefix clearly: \"TERMINAL: ...\" or \"ABORT: ...\". Keep terminal labels short.\n\n8. COVER THE PLAN'S SCOPE: entry triggers, key steps, every in-scope decision/branch, all terminal states. Don't invent steps beyond scope; don't drop in-scope branches. If a topic detail (e.g., snapshot fallback, HelloRetryRequest) is NOT in the plan's scope, do not add it.\n\n=================================================================\nPROCESS (follow before emitting)\n- Map plan panes \u2192 groups (\u2264 2), assign distinct hex colors.\n- List all required nodes; if > 15, consolidate until \u2264 15.\n- Within each pane, order nodes along the main spine so edges stay short and adjacent.\n- Give each failing decision its OWN adjacent terminal leaf (no distant shared sink, no abort pane).\n- Shorten every label to \u2264 ~90 chars / 1\u20132 lines while preserving the concrete numbers and actor.\n- Mentally re-check: Would any edge cross an unrelated node? Would the layout be wider than ~2 panes or taller than one screen? Would any box clip? If yes, restructure (fewer nodes, shorter labels, tighter branching) BEFORE emitting.\n\nOutput ONLY the final JSON object." 
+} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/whm_hybrid_opus/report.md b/scripts/experiments/gepa-flowchart/overnight/whm_hybrid_opus/report.md new file mode 100644 index 0000000..acff104 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/whm_hybrid_opus/report.md @@ -0,0 +1,19 @@ +# GEPA flowchart optimization — report (RECOVERED from gepa_state after timeout) + +Run timed out (rc=124) before the post-optimize report write; recovered from gepa_state.bin. +- iterations completed: 9 total evals: 172 +- candidates discovered: 7 + +## Val-aggregate (WHM) per candidate +| idx | val_agg | comprehension | geometry | visual_quality | parent | +|---|---|---|---|---|---| +| 0 (seed) | 0.4163 | 0.482 | 0.325 | 0.500 | [None] | +| 1 | 0.4717 | 0.585 | 0.320 | 0.644 | [0] | +| 2 | 0.4654 | 0.603 | 0.326 | 0.563 | [1] | +| 3 | 0.4737 | 0.594 | 0.336 | 0.597 | [1] | +| 4 | 0.5434 | 0.665 | 0.434 | 0.581 | [3] | +| 5 | 0.4791 | 0.562 | 0.351 | 0.650 | [1] | +| 6 **BEST** | 0.5907 | 0.750 | 0.489 | 0.528 | [4] | + +**Seed 0.4163 → Best (idx 6) 0.5907 (+0.1744)** + diff --git a/scripts/experiments/gepa-flowchart/overnight/whm_hybrid_sonnet/best_prompts.json b/scripts/experiments/gepa-flowchart/overnight/whm_hybrid_sonnet/best_prompts.json new file mode 100644 index 0000000..f6b09cb --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/whm_hybrid_sonnet/best_prompts.json @@ -0,0 +1,4 @@ +{ + "brainstorm": "You are planning a flowchart that will be fed to a DOWNSTREAM DIAGRAM GENERATOR. Your plan must produce a chart that is both comprehensible AND renders cleanly on a finite canvas. Past plans scored well on text comprehension but FAILED badly on visual rendering (nodes off-canvas, edges crossing unrelated nodes, content cut off, sparse/wasted space). 
Your top priority is fixing these problems.\n\nInputs:\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nProduce a concise plain-text plan that the generator can turn into a clean, complete, readable board.\n\n=== CRITICAL CONSTRAINTS FOR CLEAN RENDERING ===\nThe downstream generator struggles when graphs are large or have long-range edges. To prevent off-canvas nodes, edge-over-node crossings, and cut-off content, you MUST:\n\n1. KEEP IT SMALL. Cap the diagram at roughly 12\u201316 total nodes. Do NOT enumerate every possible state. Collapse minor/related steps into single nodes. If the topic is large, explicitly tell the generator to SPLIT into multiple panes/sub-flowcharts (e.g., \"Pane 1: Detection\", \"Pane 2: Recovery\") rather than one giant chart.\n\n2. NO LONG-RANGE EDGES. Every edge should connect adjacent nodes in the layout. Avoid edges that skip across the chart (e.g., a deep failure branch jumping back to the top). Instead, route failure/halt branches to a LOCAL terminal node near the decision (e.g., a nearby \"FULL HALT\" or \"Exit\" node), or use a clearly labeled connector reference instead of a drawn line. Explicitly instruct: \"do not draw edges that cross over unrelated nodes.\"\n\n3. SPECIFY DIRECTION & LAYOUT. Recommend a concrete `direction` (top-to-bottom for a single linear spine; left-to-right if branches are wide). Keep the layout compact\u2014warn against sparse layouts with large empty/nested boxes. Do not create empty container boxes.\n\n4. AVOID DEEP NESTED CONTAINERS / SWIMLANES unless they are small; the generator clips wide swimlanes and nested boxes off the canvas. Prefer simple labeled nodes over elaborate containers, legends-as-sidebars, or minimaps.\n\n5. ENSURE COMPLETENESS ON CANVAS. List nodes in a single clear linear order so nothing gets pushed off the bottom or sides. 
Put the most important decision logic early/central.\n\n=== CONTENT REQUIREMENTS (comprehension) ===\nFor the reader to fully understand, ALWAYS make these explicit in the plan (these were the recurring \"reader could not answer\" gaps):\n\n- ACTORS / DECISION OWNERSHIP: For every decision and action, name WHO or WHAT performs it (e.g., the node itself, a leader/coordinator, an external service, an automated controller, or a human/operator). Explicitly distinguish AUTOMATED steps from MANUAL steps, and call out any HUMAN-IN-THE-LOOP approval (who approves what).\n\n- CONCRETE THRESHOLDS & PARAMETERS: Include specific example values, not just abstract conditions \u2014 e.g., \"CPU > 80% for 5 minutes\", \"heartbeat missed > 3\u00d7 interval\", \"quorum = 3 of 5\", token/epoch numbers. Name the component that evaluates each condition.\n\n- BRANCH TRIGGERS: For every decision, state exactly what triggers each branch (Yes/No conditions) and where each branch leads, including FAILURE paths (e.g., what happens when fencing/STONITH is unreachable, when a scaling action fails, when tag exists or publish fails after tag succeeds).\n\n- ORDER OF OPERATIONS: Make the sequence of steps explicit and visible (e.g., detection \u2192 quorum recalc \u2192 token bump \u2192 client rejection; or version calc \u2192 changelog \u2192 tag \u2192 build \u2192 publish \u2192 rollback-on-failure).\n\n- NEWCOMER CONTEXT: Briefly define domain terms inline (as short node annotations, NOT a large separate legend box that gets clipped).\n\n=== DOMAIN FACTS TO REUSE WHEN RELEVANT ===\n- Split-brain prevention: split-brain = two nodes both act as primary (data corruption); fencing token = monotonically increasing number from lock service, must be current to write; quorum = majority (e.g., 3 of 5); STONITH/fencing = forcibly isolate/power off a node; epoch = generation counter incremented on leadership change. 
Typical flow: Normal \u2192 heartbeat missed \u2192 threshold exceeded \u2192 quorum service reachable? \u2192 majority reachable? \u2192 hold current token? \u2192 fence minority \u2192 fencing confirmed? \u2192 Safe Degraded Mode (new token) \u2192 quorum restored + operator approval \u2192 recovery \u2192 rejoining token valid (\u2264 epoch = stale \u2192 wipe/resync). Minority/uncertain segments must FULL HALT (read-only). Name the actor that decides halt-vs-degrade and require human sign-off for recovery.\n- Incident response / auto-scaling: alert states = OK, Pending, Firing, Resolved; Pending exists to avoid transient spikes (must breach for a minimum duration). Severities P1/P2/P3 with concrete thresholds. Auto-scaling policies map alert types to scaling actions (not all eligible); cooldown prevents thrashing; monitoring window observes recovery before escalating; scale-in is delayed until sustained recovery (anti-flapping). Include failure path when scaling fails or hits max capacity ceiling \u2192 escalate. Distinguish automated controller actions vs on-call human actions.\n- Semantic versioning / release tagging: Conventional Commits `type(scope): description` (feat:, fix:, chore:, docs:, perf:, refactor:, BREAKING CHANGE:); SemVer MAJOR.MINOR.PATCH; mapping BREAKING\u2192MAJOR, feat\u2192MINOR, fix/perf/refactor\u2192PATCH (with precedence = highest-impact wins). Triggered by merge to protected branch (name the actor/system, e.g., CI). No-release path when only chore/docs/style. Steps order: parse commits since last tag \u2192 determine bump \u2192 pre-release suffix (alpha/beta/rc) handling \u2192 calc version \u2192 tag-already-exists check (fail+manual review) \u2192 generate changelog \u2192 create/push tag \u2192 build artifact \u2192 publish release \u2192 notify; include rollback if tag succeeds but publish fails.\n\n=== OUTPUT FORMAT ===\nOutput concise plain text containing:\n1. 
One-line statement of what the chart must convey for this audience/purpose.\n2. Recommended layout: `direction`, whether to use a single chart or named panes, and a note to keep \u226412\u201316 nodes with no long-range/crossing edges and no empty containers.\n3. Ordered list of nodes (steps/states), each with its actor and any inline context/threshold.\n4. Decision points: each with the evaluating component/actor, the trigger condition (with concrete values), each branch's Yes/No outcome and destination, including failure paths routed to LOCAL terminal nodes.\n5. A compact linear flow summary showing adjacency only.\n\nBe specific, be complete on the listed content gaps, and above all keep the diagram small and locally-connected so it renders fully and cleanly.", + "generate": "You generate a termchart `flow` diagram as JSON.\n\n{skill_context}\n\nTopic: {topic}\nAudience: {audience}\nPurpose: {purpose}\n\nPlan to follow:\n{plan}\n\nProduce a single flow JSON object. Use clear, specific labels; label edges with the condition/trigger; group related nodes. Output ONLY the JSON." +} \ No newline at end of file diff --git a/scripts/experiments/gepa-flowchart/overnight/whm_hybrid_sonnet/report.md b/scripts/experiments/gepa-flowchart/overnight/whm_hybrid_sonnet/report.md new file mode 100644 index 0000000..09253c4 --- /dev/null +++ b/scripts/experiments/gepa-flowchart/overnight/whm_hybrid_sonnet/report.md @@ -0,0 +1,13 @@ +# GEPA flowchart optimization — whm_hybrid_sonnet (RECOVERED from gepa_state) + +- iterations: 2 total evals: 82 candidates: 4 + +| idx | val_agg | comprehension | geometry | visual_quality | parent | +|---|---|---|---|---|---| +| 0 (seed) | 0.3784 | 0.464 | 0.312 | 0.409 | [None] | +| 1 **BEST** | 0.4689 | 0.605 | 0.349 | 0.497 | [0] | +| 2 | 0.4357 | 0.605 | 0.330 | 0.416 | [1] | +| 3 | 0.4630 | 0.596 | 0.351 | 0.488 | [2] | + +**Seed 0.3784 → Best (idx 1) 0.4689 (+0.0905)** +