ivanmkc · ivanmkc · Jul 2, 2026 · Jul 3, 2026 · Jul 3, 2026
diff --git a/.claude/skills/gepa-optimization/SKILL.md b/.claude/skills/gepa-optimization/SKILL.md
@@ -0,0 +1 @@
+../../../scripts/experiments/gepa-flowchart/SKILL.md
diff --git a/packages/viewer/src/client/viewer.ts b/packages/viewer/src/client/viewer.ts
@@ -19,17 +19,6 @@ const wsid = isShareMode ? "" : (_seg[1] ?? "");
 const apiBase = isShareMode ? `/s/${shareToken}` : `/w/${wsid}`;
 // Read-only: the built-in /w/demo/ showcase AND every share view. Hides clear/console/share controls.
 const READONLY = isShareMode || wsid === "demo";
-// Experimental features (agent console / board history) are hidden by default. Reveal with
-// `?experimental=1` (sticky via localStorage `tc-experimental`); `?experimental=0` hides them again.
-const EXPERIMENTAL = (() => {
-  try {
-    const q = new URLSearchParams(location.search).get("experimental");
-    if (q !== null) localStorage.setItem("tc-experimental", q === "0" || q === "" ? "" : "1");
-    return !!localStorage.getItem("tc-experimental");
-  } catch {
-    return false;
-  }
-})();
 // Honor the OS "reduce motion" setting for auto-advancing (`live`) scopes: render one frame then
 // freeze, instead of looping. `frozenLive` tracks scopes already shown their first frame.
 const REDUCE_MOTION = typeof matchMedia === "function" && matchMedia("(prefers-reduced-motion: reduce)").matches;
@@ -1164,17 +1153,11 @@ if (isShareMode) {
     document.getElementById("history-toggle")?.remove();
     document.getElementById("templates")?.remove(); // keep the public demo from exposing the library
   } else {
-    const templatesBtn = document.getElementById("templates") as HTMLButtonElement | null;  // not experimental
+    initConsole();
+    const ht = document.getElementById("history-toggle");
+    if (ht) ht.onclick = () => toggleHistory(!historyEnabled);
+    const templatesBtn = document.getElementById("templates") as HTMLButtonElement | null;
     if (templatesBtn) { templatesBtn.hidden = false; templatesBtn.onclick = () => void openTemplatesPopover(templatesBtn); }
-    if (EXPERIMENTAL) {
-      initConsole();
-      const ht = document.getElementById("history-toggle");
-      if (ht) ht.onclick = () => toggleHistory(!historyEnabled);
-    } else {
-      // Experimental features off by default — hide the chat console + board-history toggles entirely.
-      document.getElementById("console-toggle")?.remove();
-      document.getElementById("history-toggle")?.remove();
-    }
   }
   const shareBtn = document.getElementById("share") as HTMLButtonElement | null;
   if (shareBtn) shareBtn.onclick = () => void openSharePopover(shareBtn);

diff --git a/scripts/experiments/gepa-flowchart/.gitignore b/scripts/experiments/gepa-flowchart/.gitignore
@@ -2,4 +2,26 @@ __pycache__/
 *.pyc
 .venv/
 node_modules/
+.pytest_cache/
+
+# GEPA run artifacts — keep source, summaries (*.md), reports, and best_* JSON.
+# Everything below is regenerable scratch from a run and is NOT committed.
 runs/
+**/gepa_state.bin
+**/run_log*
+**/candidate_tree.html
+**/candidates.json
+**/generated_best_outputs_valset/
+**/frozen_questions.json
+**/frozen_junior.json
+overnight/frozen/
+
+# scratch example boards
+board_*.json
+consolidation_board.json
+panes_stack.json
+
+# stray logs + generated corpus
+**/*.log
+**/*.log.*
+overnight/corpus.json
diff --git a/scripts/experiments/gepa-flowchart/SKILL.md b/scripts/experiments/gepa-flowchart/SKILL.md
@@ -0,0 +1,174 @@
+---
+name: gepa-optimization
+description: Use when running, extending, or reasoning about GEPA prompt optimization for termchart diagram skills — the runbook (commands, env vars, entry points), the scoring/metric design, the OOD-holdout discipline, and the hard-won gotchas. Covers single-prompt, per-journey, and joint topology-skill optimization.
+---
+
+# GEPA optimization for termchart
+
+GEPA (reflective prompt evolution, `gepa==0.1.1`) optimizes the **prompt-text
+"skills"** termchart uses to author diagrams, scored by **rendering the board in a
+real browser and judging it**. This skill is the methodology + runbook so a future
+session reproduces a run without re-deriving any of it.
+
+Everything below runs from `scripts/experiments/gepa-flowchart/`. The package README
+covers first-time setup (npm build of the viewer, `pip install -e ".[dev]"`); this
+doc is the *how to run it well* layer.
+
+## What GEPA actually is here (mental model)
+
+- A GEPA **candidate is a `dict` of named prompt-text components** — topology skills
+  (`board_layout`, `graph_process_spine`, `graph_entity_lanes`, `graph_zoned_tiers`,
+  `comparison_grid`, `dashboard_grid`, `report_rows`, `screen_frame`, `chart_internal`),
+  a universal `artifact_note`, and per-journey tails (`tail_<journey>`).
+- GEPA does **evolutionary + reflective search over those strings**. It is **NOT**
+  spinning up Claude Code instances or skills. Three LLM roles, each swappable:
+  **generator** (authors the board), **judge/reader** (scores it), **reflector**
+  (proposes prompt edits from feedback).
+- A journey's prompt is **composed by walking its topology path**: `board_layout` +
+  the journey's pattern skill(s) + its tail. Focused composition (only path skills,
+  no global dump) is what prevents cross-artifact rule leakage.
+- Hierarchy: **JOURNEY** (use-case → topology + rubric) → **TOPOLOGY skills** (shared,
+  GEPA-optimized) → **ATOMS** (covered by input schema, no per-atom skill).
+
+## Auth (Vertex is the default in practice)
+
+```bash
+source ~/.profile 2>/dev/null   # remote viewer/env vars live here, NOT in the Bash tool's env
+export CLAUDE_CODE_USE_VERTEX=1 ANTHROPIC_VERTEX_PROJECT_ID=adk-coding-agents \
+       CLOUD_ML_REGION=global PYTHONDONTWRITEBYTECODE=1
+gcloud auth application-default login   # ADC (once)
+```
+`get_client()` auto-selects the Vertex client when `CLAUDE_CODE_USE_VERTEX` (or
+`GEPA_USE_VERTEX`) is set; otherwise it uses the direct Anthropic API
+(`ANTHROPIC_API_KEY`). Gemini judges use `GOOGLE_CLOUD_PROJECT` / `GOOGLE_CLOUD_LOCATION`.
+
+## Entry points (`python -m gepa_flowchart.<module>`)
+
+| Module | Purpose | Key flags |
+|---|---|---|
+| `run` | Single-prompt optimization (brainstorm+generate over topics) | `--smoke` `--max-metric-calls` `--train 8` `--val` `--topics` `--run-dir` |
+| `journey_run` | Optimize ONE journey's components | `--journey <name>` (req) `--max-metric-calls 90` `--val 2` `--run-dir` |
+| `multi_journey_run` | **Joint** GEPA over shared topology skills across journeys | `--group {graph,comparison,dashboard,report,chart,screen}` or `--journeys …`; `--max-metric-calls 200` `--val 1` `--run-dir` (req) `--seed-from <best_topology_skills.json>` (warm start) |
+| `type_run` | Typed-pipeline run (per diagram type) | see `--help` |
+| `holdout_eval` | **OOD gate**: seed vs promoted `SKILL_TEXT` on the untouched holdout, cross-family panel + K=3 | (no args; reads `holdout_journeys.json`) |
+| `ood_overexplain <run_dir>` | OOD eval of a run's `best_topology_skills.json` on **junior + viz** axes | positional run dir (default `overnight/combined_overexplain`) |
+| `topology_regression` | 3-arm FLAT / SCHEMA / TOPO regression on a fixed sample | — |
+| `crosseval` / `cross_round_validate` | Cross-eval best skills on held-out instances / across rounds | — |
+| `judge_agreement` | Inter-judge agreement diagnostics | — |
+| `promote_validated` | Promote validated skills into `skill_library.SKILL_TEXT` | — |
+| `recover` | Rebuild best candidate from `gepa_state.bin` | — |
+
+Outputs land in the `--run-dir`: `report.md`, `best_prompts.json` /
+`best_topology_skills.json`, `frozen_questions.json`, `gepa_state.bin`, `run_log*.txt`.
+
+## The metric (unified scorer — `unified_metric.py`)
+
+Board is rendered in a real browser (viewer + Chromium via `TypedRenderService`) and
+scored on axes, gated by structural validity, combined by **weighted harmonic mean**:
+
+| Axis | What | Default weight | env |
+|---|---|---|---|
+| comprehension | mean(text VQA, vision VQA) from run-frozen reader questions | `w_comp 0.35` | `GEPA_W_COMP` |
+| geometry | mean(heuristic `geometryReport`, rendered-DOM overlaps/offscreen/font) — **dominant** | `w_geom 0.50` | `GEPA_W_GEOM` |
+| visual_quality | legibility / crowding / overlaps / clipping (vision) | `w_vq 0.15` | `GEPA_W_VQ` |
+| junior | per-journey **junior-comprehension** rubric (over-explain for newcomers) | set via `GEPA_W_JUNIOR` | `GEPA_W_JUNIOR`; axis gated by `GEPA_JUNIOR_RUBRIC=1` |
+| viz | visualization-usage: good prose/diagram mix, products have images, links checked | set via `GEPA_W_VIZ` | `GEPA_W_VIZ`; axis gated by `GEPA_VIZ_USAGE=1` |
+
+- **Which axes span the score + Pareto frontier**: `GEPA_OBJECTIVES` (default all three
+  core axes). `GEPA_COMP_FLOOR` guards comprehension when it's excluded from the score.
+- **`frontier_type` = "hybrid"** (`GEPA_FRONTIER_TYPE`): Pareto front over both val
+  instances and objectives.
+- Default models: generator/judge/reflection/vision `claude-opus-4-8`, reader
+  `claude-sonnet-4-6`. Override with `GEPA_GEN_MODEL` / `GEPA_JUDGE_MODEL` /
+  `GEPA_REFLECTION_MODEL` / `GEPA_VISION_MODEL` / `GEPA_READER_MODEL`.
+
+### Anti-Goodhart machinery (use it — don't defeat it)
+
+- **PoLL panel** (Verga 2024): `GEPA_JUDGE_PANEL="m1,m2,…"` → per-axis **median** across
+  judges + **inter-judge disagreement** signal + **per-axis abstain** on empty responses.
+  Empty panel falls back to `[vision_model]`.
+- **K-sample generation**: `GEPA_GEN_SAMPLES=K` → median over K rolls (`score_sampled`)
+  to average out *generation* variance, the dominant noise source.
+- **Optimize-judge ≠ validation-judge.** Optimize with a cheap judge (e.g. Gemini
+  Flash); **validate/gate with a different, cross-family panel** (`GATE_PANEL` in
+  `holdout_eval`/`ood_overexplain`, K≥3). If both are the same model, you're optimizing
+  for the judge's blind spots.
+
+## The OOD-holdout discipline (the gate — non-negotiable)
+
+- `holdout_journeys.json` is **10 out-of-distribution use-cases** kept in a **separate
+  file so optimization physically cannot load it**. **Never** add a holdout journey to
+  a trainset or valset. It is the only honest generalization signal.
+- Validate with `holdout_eval` (promoted-skills gate) or `ood_overexplain <run_dir>`
+  (per-run junior/viz). Both use the cross-family gate panel + K≥3.
+- **Noise-floor verdict**: compare **changed-skill** journeys vs **control** journeys
+  (whose composed prompt didn't change) in the same run. A lift only counts if it clears
+  the control spread. Caveat: if a skill is in *every* journey's path (e.g.
+  `board_layout`), there are **zero controls → no measured noise floor** — say so and
+  treat marginal deltas as unresolved, not wins.
+- **Don't game the metrics.** Two burned lessons: (1) tightening `graph_process_spine`
+  cut variance but **regressed** OOD quality (patient-triage 0.367→0.203) — *variance-down
+  ≠ quality-up*; (2) a viz-protective re-weight made GEPA change **nothing**, proving the
+  junior/viz tension is fundamental for one universal `board_layout`. If a change only
+  moves the number without moving real readability, revert it.
+
+## Running a real job (background, rate-limit-safe)
+
+Detached launch template — the script must be the **FIRST** command in the heredoc, no
+leading `pkill` (a nonzero exit aborts the whole chain):
+
+```bash
+cat > "$CLAUDE_JOB_DIR/tmp/run_x.sh" <<'EOF'
+#!/usr/bin/env bash
+cd /home/ivanmkc/termchart/.claude/worktrees/gepa-flowchart/scripts/experiments/gepa-flowchart
+source ~/.profile 2>/dev/null
+export CLAUDE_CODE_USE_VERTEX=1 ANTHROPIC_VERTEX_PROJECT_ID=adk-coding-agents CLOUD_ML_REGION=global PYTHONDONTWRITEBYTECODE=1
+export GEPA_JUNIOR_RUBRIC=1 GEPA_VIZ_USAGE=1 GEPA_CHECK_LINKS=1
+export GEPA_JUDGE_PANEL=gemini-2.5-flash GEPA_GEN_SAMPLES=2      # lean optimize config under rate limits
+export GEPA_RENDER_PORT=8961 GEPA_VIEWER_PORT=8960               # ISOLATE ports per concurrent run
+exec python3 -u -m gepa_flowchart.multi_journey_run --group graph --run-dir overnight/graph1
+EOF
+chmod +x "$CLAUDE_JOB_DIR/tmp/run_x.sh"
+setsid "$CLAUDE_JOB_DIR/tmp/run_x.sh" >"$CLAUDE_JOB_DIR/tmp/run_x.log" 2>&1 &
+```
+
+**Detect a live run** (the naive proc grep false-matches watcher loops):
+```bash
+ps -e -o cmd | grep -E "python3 .*gepa_flowchart\.(ood_overexplain|multi_journey_run|journey_run|run)" | grep -v grep
+```
+
+## Gotchas (the expensive-to-rediscover list)
+
+- **`gepa_state.bin` is the source of truth**, not the printed log or `report.md`. If a
+  run dies, `recover` / `_recover_best_from_state` rebuilds the best candidate from it.
+- **Self-repair loop is net-negative** — keep `GEPA_SELF_REPAIR` / `GEPA_REPAIR` off.
+  It adds a fix pass that costs more than it recovers.
+- **Component vocab must match the viewer exactly.** Skills must only emit real Mantine
+  component names the renderer knows (e.g. there is **no `Stat`** component). An invented
+  name renders an error block and craters the board.
+- **Isolate render/viewer ports per concurrent run** (`GEPA_RENDER_PORT`/`GEPA_VIEWER_PORT`;
+  e.g. 8899/8898, 8961/8960). Two runs on the same port corrupt each other's renders.
+- **Vertex 429 RESOURCE_EXHAUSTED** is handled by `llm._retry` (exponential backoff 5s→120s,
+  7 attempts), which then degrades gracefully to an empty response. Under sustained limits, run the **lean config**:
+  single `gemini-2.5-flash` optimize judge + `K=2`.
+- **`gemini-2.5-pro` returns all-zero rubric** (thinking eats the token budget) — keep it
+  OUT of the gate panel; the abstain logic tolerates a flaky judge but a silently-zero one
+  poisons the median.
+- **Schema is where atoms live.** The er-diagram crater was a *schema gap*
+  (`_FLOW_SCHEMA` never documented `type:"entity"` + `fields[]`), not a prompt problem —
+  fix data/schema at the source, not with more prose.
+- Ported wins ship via the **plugin** (`plugin/skills/diagram-recipes/SKILL.md` +
+  version bump), independent of the viewer image and npm CLI. GEPA validates a direction;
+  porting is a separate, guarded step (keep the "don't crowd visuals" space-budget guard).
+
+## Where things are
+
+- Metric: `unified_metric.py` (`score_unified`, `score_sampled`, `check_links`,
+  PoLL panel, junior/viz axes). Config + all `GEPA_*` env parsing: `config.py`.
+- Data/questions: `dataset.py` (`freeze_questions`, junior rubric). Journeys:
+  `journeys.py` + `journeys_catalog.json`; topology skills: `skill_library.py`
+  (`SKILL_TEXT`, `_SEED_TEXT`, `topology_path`).
+- Adapters: `journey_adapter.py`, `hierarchical_adapter.py`, `type_adapter.py`.
+- Render bridge: `type_render.py` (`TypedRenderService`), `geometry_bridge.py`.
+- LLM + retry: `llm.py`. Holdout set: `holdout_journeys.json`.
+- Results write-ups: `overnight/AUTONOMOUS_SUMMARY.md` + `overnight/SUMMARY_*.md`.
diff --git a/scripts/experiments/gepa-flowchart/overnight/SUMMARY.md b/scripts/experiments/gepa-flowchart/overnight/SUMMARY.md
@@ -0,0 +1,10 @@
+# Overnight experiments — held-out comparison
+
+Canonical metric (weighted harmonic mean, WHM; weights 0.5/0.3/0.2), held-out topics (12, unseen during optimization): onboarding, state-machine, k8s-deploy, signup-funnel, saga-compensation, oauth-pkce, raft-election, k8s-scheduling, tcp-lifecycle, payment-3ds, blue-green, rate-limiter
+
+| candidate | score | comprehension | geometry | visual_quality |
+|---|---|---|---|---|
+| whm_hybrid_opus | **0.588** | 0.77 | 0.47 | 0.53 |
+| linear_instance | **0.539** | 0.73 | 0.44 | 0.46 |
+| whm_hybrid_sonnet | **0.490** | 0.66 | 0.39 | 0.48 |
+| seed | **0.411** | 0.49 | 0.35 | 0.50 |
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		../../../scripts/experiments/gepa-flowchart/SKILL.md