From f54fd1bfef100340498614c8f72e68b92f8f1213 Mon Sep 17 00:00:00 2001
From: Ivan Cheung <ivanmkc@google.com>
Date: Fri, 3 Jul 2026 23:09:44 +0000
Subject: [PATCH 1/7] agent-compat: journey-activation harness for
 Claude/Gemini/Antigravity vs termchart

Uses agent-generator's benchmark-runner to drive each external coding-agent
CLI headless on termchart tasks and verify (via a local viewer, deterministically)
whether it activates the right diagram journey for a scenario.

- definitions/: smoke cases (can it push) + journey cases (scenario-only, does it
  pick the right --type). Shared AGENTS.md decision guide; scope-specific,
  trace-independent verify (fair to Antigravity which has no trace).
- run.sh: builds+serves the repo viewer, PATH-wraps termchart (inject viewer creds)
  and antigravity->agy, clears per backend, loops backends, preflights+SKIPs agy
  (its $HOME login can't survive the sandbox's HOME=/tmp).
- agent-compat.yml + run_locally.sh: run in CI or locally via act+podman.

First results (RESULTS.md): Gemini 2/5, Claude 0/5 journey activation; both mis-use
termchart's --type vocabulary and `push` silently accepts invalid types (no
corrective feedback). Antigravity SKIP (auth).
---
 .github/workflows/agent-compat.yml            |  72 ++++++
 scripts/experiments/agent-compat/README.md    |  85 ++++++++
 scripts/experiments/agent-compat/RESULTS.md   |  69 ++++++
 .../agent-compat/definitions/case_sets.yaml   |  17 ++
 .../definitions/cases/termchart_journeys.yaml | 206 ++++++++++++++++++
 .../cases/termchart_signup_flow.yaml          | 102 +++++++++
 .../cases/termchart_status_component.yaml     |  91 ++++++++
 .../definitions/generator_sets.yaml           |  18 ++
 .../agent-compat/definitions/generators.yaml  |  23 ++
 scripts/experiments/agent-compat/run.sh       | 163 ++++++++++++++
 .../experiments/agent-compat/run_locally.sh   |  37 ++++
 11 files changed, 883 insertions(+)
 create mode 100644 .github/workflows/agent-compat.yml
 create mode 100644 scripts/experiments/agent-compat/README.md
 create mode 100644 scripts/experiments/agent-compat/RESULTS.md
 create mode 100644 scripts/experiments/agent-compat/definitions/case_sets.yaml
 create mode 100644 scripts/experiments/agent-compat/definitions/cases/termchart_journeys.yaml
 create mode 100644 scripts/experiments/agent-compat/definitions/cases/termchart_signup_flow.yaml
 create mode 100644 scripts/experiments/agent-compat/definitions/cases/termchart_status_component.yaml
 create mode 100644 scripts/experiments/agent-compat/definitions/generator_sets.yaml
 create mode 100644 scripts/experiments/agent-compat/definitions/generators.yaml
 create mode 100755 scripts/experiments/agent-compat/run.sh
 create mode 100755 scripts/experiments/agent-compat/run_locally.sh

diff --git a/.github/workflows/agent-compat.yml b/.github/workflows/agent-compat.yml
new file mode 100644
index 0000000..f19d0c8
--- /dev/null
+++ b/.github/workflows/agent-compat.yml
@@ -0,0 +1,84 @@
+name: agent-compat
+
+# Verify external coding-agent CLIs (Claude Code, Gemini CLI, Antigravity) can drive
+# the termchart CLI to push valid boards to a live viewer. Orchestrated by
+# agent-generator's benchmark-runner; the real logic lives in
+# scripts/experiments/agent-compat/ (run.sh + definitions/).
+#
+# Runnable in CI (dispatch) and locally via act+podman
+# (scripts/experiments/agent-compat/run_locally.sh) — cloud-auth steps are ACT-gated.
+
+on:
+  workflow_dispatch:
+    inputs:
+      backends:
+        description: "Space-separated backend sets"
+        default: "tc-claude tc-gemini tc-antigravity"
+      case_set:
+        description: "Case set (pattern ^TC-)"
+        default: "termchart-compat"
+
+jobs:
+  compat:
+    runs-on: ubuntu-latest
+    # id-token: write lets the WIF auth step request a GitHub OIDC token; without it
+    # google-github-actions/auth cannot exchange credentials on hosted runners.
+    permissions:
+      contents: read
+      id-token: write
+    env:
+      AGENT_GENERATOR_DIR: ${{ vars.AGENT_GENERATOR_DIR || '/home/ivanmkc/agent-generator' }}
+      GOOGLE_CLOUD_PROJECT: ${{ vars.GCP_PROJECT_ID || 'adk-coding-agents' }}
+      ANTHROPIC_VERTEX_PROJECT_ID: ${{ vars.GCP_PROJECT_ID || 'adk-coding-agents' }}
+      CASE_SET: ${{ inputs.case_set }}
+    steps:
+      - uses: actions/checkout@v4
+
+      - uses: actions/setup-node@v4
+        with: { node-version: "20" }
+      - uses: actions/setup-python@v5
+        with: { python-version: "3.11" }
+
+      - name: Install uv
+        run: curl -LsSf https://astral.sh/uv/install.sh | sh
+
+      - name: Install agent CLIs + termchart
+        run: |
+          npm i -g @anthropic-ai/claude-code @google/gemini-cli @ivanmkc/termchart
+          # Antigravity (agy) is not public; run_locally.sh bind-mounts the host binary
+          # under act. In hosted CI the tc-antigravity backend is skipped (agy absent).
+
+      - name: Install benchmark-runner (agent-generator)
+        run: |
+          if [ -d "$AGENT_GENERATOR_DIR" ]; then
+            (cd "$AGENT_GENERATOR_DIR" && uv tool install --prerelease allow .)
+          else
+            echo "::error::AGENT_GENERATOR_DIR not present; provide agent-generator (bind-mount under act, or clone in CI)"
+            exit 1
+          fi
+          echo "$HOME/.local/bin" >> "$GITHUB_PATH"
+
+      # Hosted CI: authenticate to GCP via WIF (skipped under act — uses mounted ADC).
+      - name: GCP auth (CI only)
+        if: ${{ !env.ACT }}
+        uses: google-github-actions/auth@v3
+        with:
+          workload_identity_provider: ${{ secrets.WIF_PROVIDER }}
+          service_account: ${{ secrets.WIF_SERVICE_ACCOUNT }}
+
+      # Local act: verify a host ADC was mounted (models are Vertex-routed).
+      - name: Verify ADC (act only)
+        if: ${{ env.ACT }}
+        run: |
+          test -f "$HOME/.config/gcloud/application_default_credentials.json" \
+            || { echo "Mount host ADC: run via run_locally.sh"; exit 1; }
+
+      - name: Run agent-compat
+        # The dispatch input travels as data (env) and is split into an argv array, so
+        # it is only ever passed to run.sh as arguments — never parsed as shell code.
+        env:
+          BACKENDS: ${{ inputs.backends }}
+        shell: bash
+        run: |
+          read -r -a backends <<< "$BACKENDS"
+          bash scripts/experiments/agent-compat/run.sh "${backends[@]}"
diff --git a/scripts/experiments/agent-compat/README.md b/scripts/experiments/agent-compat/README.md
new file mode 100644
index 0000000..90f0b24
--- /dev/null
+++ b/scripts/experiments/agent-compat/README.md
@@ -0,0 +1,85 @@
+# agent-compat — do Claude Code, Gemini CLI & Antigravity play nice with termchart?
+
+Runs each external coding-agent CLI **headless** on termchart tasks and verifies, via a
+live viewer, that each one:
+
+1. **can drive the `termchart` CLI** to push a valid board (smoke cases), and
+2. **activates the right journey/recipe** — given a *scenario only* (never told the
+   diagram type), does it pick the correct termchart type for the job? (journey cases)
+
+Orchestrated by [agent-generator]'s `benchmark-runner` (interactive-simulation cases,
+`output_format: direct` — prompts go straight to the CLI backend).
+
+## How it works
+
+```
+run.sh
+ ├─ build + run the repo's termchart CLI:  termchart serve  → local viewer, capture URL+token
+ ├─ PATH wrappers (under $HOME, NOT /tmp — the sandbox tmpfs-masks /tmp):
+ │    termchart   → inject viewer URL/token, exec the repo CLI
+ │    antigravity → exec `agy --dangerously-skip-permissions`  (harness calls `antigravity -p …`; agy -p == --print)
+ ├─ for each backend {claude, gemini-cli, antigravity}:
+ │    (antigravity: preflight agy auth under a fresh HOME → SKIP if it can't auth)
+ │    termchart clear --all         → fresh viewer state
+ │    benchmark-runner --config-dir definitions --case-set <set> --generator-set <backend>
+ └─ summary (PASS / FAIL / SKIP per backend)
+```
+
+**Why a `termchart` wrapper, not env vars:** agent-generator's sandbox forwards `PATH`
+to the agent but not arbitrary env (`TERMCHART_VIEWER_URL/TOKEN`). Baking the viewer
+config into a PATH wrapper lets every agent run `termchart push` with zero credential
+handling. It must live under `$HOME` (ro-bound into the sandbox), not `/tmp` (the sandbox
+mounts a fresh tmpfs over `/tmp`, which would hide it).
+
+**Verification is deterministic, scope-specific, and trace-independent.** Each case pins
+a unique scope (`--project … --agent <case>`) but lets the agent choose the `--type`;
+a `command` objective then checks *that scope's* board has the prescribed type via
+`termchart list` (e.g. `/compare …[component]`). Scope-specific ⇒ no cross-case
+contamination / false passes. Trace-independent ⇒ it's the only fair way to score
+**Antigravity**, whose harness produces no tool trace (the `cli_command` objective, which
+needs a trace, is `required: false`).
+
+## Cases (`definitions/cases/`)
+
+| Case set | Cases | Tests |
+|---|---|---|
+| `termchart-smoke` | `TC-FLOW-001`, `TC-COMPONENT-001` | Prompt names the type — can the agent drive `termchart push` at all |
+| `termchart-journeys` | `TC-JOURNEY-{COMPARE,ARCH,METRICS,ER,DASHBOARD}-001` | **Scenario only** — does the agent pick the right type/journey (component / flow / vegalite / flow / panes\|component) |
+| `termchart-compat` | all `TC-*` | smoke + journeys |
+
+Every case seeds one shared `AGENTS.md` "choose the diagram" decision guide (auto-read by
+claude/gemini/agy), so all backends have the same recipe knowledge — the test is whether
+they **activate** the right journey, not whether they know termchart exists.
+
+## Run it
+
+```bash
+scripts/experiments/agent-compat/run.sh                          # all backends, all cases
+CASE_SET=termchart-journeys scripts/experiments/agent-compat/run.sh tc-claude tc-gemini
+scripts/experiments/agent-compat/run_locally.sh                  # via GitHub Actions + act+podman
+```
+
+## Requirements
+
+- `claude`, `gemini`, `agy`, `node` on PATH (run.sh warns if any are missing).
+- `benchmark-runner` (`cd <agent-generator> && uv tool install --prerelease allow .`) or `uv` on PATH.
+- GCP ADC — models are Vertex-routed (`gcloud auth application-default login`).
+- The repo's termchart CLI + viewer are built automatically on first run.
+- `AGENT_GENERATOR_DIR` pointing at an agent-generator checkout (default `/home/ivanmkc/agent-generator`).
+
+## Backend status & caveats
+
+- **Claude Code, Gemini CLI — supported and verified.** Both can drive `termchart push`
+  (smoke cases pass; see RESULTS.md for journey scores); auth is forwarded via env (Vertex).
+- **Antigravity (agy) — SKIPPED by default (auth limitation).** agy authenticates via a
+  login stored in `$HOME`, but agent-generator's sandbox forces `HOME=/tmp`, so agy can't
+  authenticate inside a case (its creds are neither ADC- nor env-based, so they can't be
+  forwarded the way Claude/Gemini creds are). run.sh detects this and marks `tc-antigravity`
+  **SKIP** instead of a misleading FAIL. The wrapper + deterministic check are correct and
+  will work once agy's auth is available in the sandbox — e.g. by extending agent-generator's
+  (stub) antigravity harness `get_sandbox_auth` to mount agy's credential dir, or running
+  agy's backend outside the sandbox. Its harness also extracts no tool trace (stub), so only
+  the deterministic viewer check applies to it.
+- Add a case by dropping a `TC-*` interactive-simulation YAML in `definitions/cases/`.
+
+[agent-generator]: https://github.com/ivanmkc/agent-generator
diff --git a/scripts/experiments/agent-compat/RESULTS.md b/scripts/experiments/agent-compat/RESULTS.md
new file mode 100644
index 0000000..17244a3
--- /dev/null
+++ b/scripts/experiments/agent-compat/RESULTS.md
@@ -0,0 +1,69 @@
+# agent-compat — first results (2026-07-03)
+
+Ran the harness against Claude Code (`claude`, haiku-4.5), Gemini CLI (`gemini`,
+gemini-3-flash), and Antigravity (`agy`), on this machine (Vertex-routed, local viewer).
+
+## Smoke (`termchart-smoke`) — "can the agent drive `termchart push` at all"
+
+Prompt explicitly names the diagram type.
+
+| Backend | TC-FLOW-001 | TC-COMPONENT-001 |
+|---|---|---|
+| Claude Code | ✅ | ✅ |
+| Gemini CLI | ✅ | ✅ |
+| Antigravity | SKIP (auth) | SKIP (auth) |
+
+Both supported CLIs can author a spec and push it. The basic termchart CLI contract works.
+
+## Journey activation (`termchart-journeys`) — "does it pick the RIGHT journey"
+
+Prompt is **scenario only**; the agent must choose the correct termchart `--type`. All
+backends get the same `AGENTS.md` decision guide (intent → type). Deterministic,
+scope-specific verify (`termchart list` shows the case's board with the prescribed type).
+
+| Scenario → correct type | Claude Code | Gemini CLI |
+|---|---|---|
+| compare laptops → `component` | ❌ used `table` / `comparison` | ❌ used `comparison` |
+| web-app architecture → `flow` | ❌ used `flowchart` | ❌ used `mermaid` |
+| MAU trend → `vegalite` | ❌ used `line` | ✅ `vegalite` |
+| blog DB schema → `flow` | ❌ used `erDiagram` / `erd` | ❌ used `er` |
+| service dashboard → `panes`/`component` | ❌ used `status` / `graph` | ✅ `panes` |
+| **Score** | **0 / 5** | **2 / 5** |
+
+(Identical across two runs, including one where `AGENTS.md` **explicitly** listed the
+valid types and said "do not invent names" — behaviour did not change.)
+
+## Findings
+
+1. **Neither agent reliably activates termchart's `--type` vocabulary.**
+   - **Claude** anchors on **Mermaid / semantic** names (`flowchart`, `erDiagram`, `table`,
+     `line`, `status`) for every case — likely because `termchart --help` leads with its
+     "deterministic Mermaid → ASCII" heritage, so it treats termchart as a Mermaid tool.
+   - **Gemini** gets the termchart-native types right for charts/dashboards
+     (`vegalite`, `panes`) but falls back to invalid semantic names (`comparison`, `er`)
+     or `mermaid` for comparison / architecture / ER.
+   - So the diagram **journeys** (component for comparisons, flow for architecture/ER) are
+     mostly **not** being activated correctly by either CLI in a scenario-only prompt.
+
+2. **`termchart push` silently accepts unknown `--type` values.** `push --type comparison`
+   (or `erDiagram`, `status`, …) exits 0 and stores the board; the viewer then can't render
+   it. Because there is no error, the agent gets **no corrective feedback** and never
+   retries with a valid type. This is the single highest-leverage fix: **validate `--type`
+   in `push` and reject unknown values with the valid list** (`flow | component | vegalite |
+   panes | markdown | mermaid | calltree`). Agents would see the error and self-correct.
+
+3. **Antigravity can't be scored here.** `agy` authenticates via a login stored in `$HOME`,
+   but agent-generator's sandbox forces `HOME=/tmp`; its creds are neither ADC- nor
+   env-based, so (unlike Claude/Gemini's Vertex env) they can't be forwarded. run.sh
+   preflights this and reports SKIP.
+
+## Recommended next steps (not done here)
+
+- **Fix `termchart push` to validate `--type`** (finding #2) — most impactful for agent
+  compatibility; makes termchart self-correcting for any agent. Re-run this suite after.
+- Make the rich viewer types prominent in `termchart --help` / plugin so Claude stops
+  defaulting to Mermaid vocabulary.
+- Forward agy's auth into the sandbox (extend agent-generator's antigravity harness
+  `get_sandbox_auth`) to score Antigravity.
+- Optionally mount the termchart plugin into Claude's sandbox to test its real-world
+  (plugin-equipped) behaviour vs the AGENTS.md-only condition measured here.
diff --git a/scripts/experiments/agent-compat/definitions/case_sets.yaml b/scripts/experiments/agent-compat/definitions/case_sets.yaml
new file mode 100644
index 0000000..90ed91a
--- /dev/null
+++ b/scripts/experiments/agent-compat/definitions/case_sets.yaml
@@ -0,0 +1,17 @@
+kind: case_sets
+
+case_sets:
+  termchart-compat:  # union: every case whose id starts with TC-
+    description: 'All termchart CLI compatibility tasks (smoke + journey activation)'
+    patterns:
+      - '^TC-'
+
+  termchart-smoke:  # TC-FLOW-* and TC-COMPONENT-* ids only
+    description: 'Basic push smoke tests (prompt names the type)'
+    patterns:
+      - '^TC-(FLOW|COMPONENT)-'
+
+  termchart-journeys:  # TC-JOURNEY-* ids only
+    description: 'Journey activation: scenario-only prompts, verify the agent picks the right type'
+    patterns:
+      - '^TC-JOURNEY-'
diff --git a/scripts/experiments/agent-compat/definitions/cases/termchart_journeys.yaml b/scripts/experiments/agent-compat/definitions/cases/termchart_journeys.yaml
new file mode 100644
index 0000000..4b92845
--- /dev/null
+++ b/scripts/experiments/agent-compat/definitions/cases/termchart_journeys.yaml
@@ -0,0 +1,206 @@
+# Journey-activation cases: the prompt describes a SCENARIO only (never the diagram
+# type). Each case pins a unique scope (--project tc-journey --agent <case>) but lets
+# the agent CHOOSE the --type, then verifies THAT scope's board has the type the
+# journey prescribes. Scope-specific check => no cross-case contamination and no false
+# pass from another case's board. Outcome-based => trace-independent, fair across
+# Claude / Gemini / Antigravity.
+#
+# All cases share one AGENTS.md "decision guide" (YAML anchor) so every backend has the
+# same recipe knowledge — the test is whether it ACTIVATES the right one.
+benchmarks:
+- benchmark_type: interactive_simulation
+  id: TC-JOURNEY-COMPARE-001  # selected by the termchart-journeys pattern ^TC-JOURNEY-
+  case_data:
+    name: "journey: compare options -> component"
+    id: TC-JOURNEY-COMPARE-001
+    description: "Shopping/decision scenario should activate the product-comparison journey (component)."
+    requirements: ["Pick the right termchart diagram type for a comparison and push it."]
+    complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 }
+    environment_config:
+      resources:
+        - type: file_system
+          name: agents_md
+          description: termchart decision guide, auto-read by claude/gemini/agy.
+          path: AGENTS.md  # seeded from the &guide literal below; the other journey cases alias it
+          initial_content: &guide |
+            # termchart — pick the RIGHT diagram for the job
+
+            `termchart` is on PATH; the viewer is pre-configured (no token needed).
+            Choose the TYPE that best fits the request, author its JSON, push it, confirm:
+              termchart push --project <p> --agent <a> --type <TYPE> --description "<what>" <file.json> --focus
+              termchart list
+
+            IMPORTANT: --type MUST be exactly one of these termchart types:
+              flow | component | vegalite | panes | markdown | mermaid
+            Do NOT invent other names (NOT "comparison", "erd", "architecture", "line",
+            "graph", "chart"). Map your intent to one of the valid types below.
+
+            Choose the type by intent:
+            - Compare options / products / "which should I pick"  -> component
+              (cards; each option an Image + Title + outbound Anchor link + key specs)
+            - System/service architecture, request flow, pipeline, ER schema, state machine -> flow
+              (nodes+edges, "direction":"TB"; for ER use entity nodes with typed fields)
+            - A metric / trend / distribution over time or category -> vegalite
+              (a Vega-Lite spec: title, axis titles, tooltip)
+            - Dashboard of several independent tiles (latency, errors, uptime) -> panes
+              (grid of component/vegalite tiles) — or one component with stat cards
+            - Notes / write-up / explanation -> markdown
+
+            Shapes:
+            - flow:      {"nodes":[{"id":"n1","data":{"label":"..."}}],"edges":[{"id":"e1","source":"n1","target":"n2"}],"direction":"TB"}
+            - component: {"type":"Stack","children":[{"type":"Title","children":"..."},{"type":"Badge","props":{"color":"green"},"children":"OK"}]}
+            - vegalite:  {"title":"...","data":{"values":[...]},"mark":"line","encoding":{"x":{...},"y":{...}}}
+            - panes:     {"layout":"grid","panes":[{"title":"...","type":"vegalite","content":{...}}]}
+      verification_steps: []
+    agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 }
+    conversations:
+      - id: compare  # same word as the `--agent compare` scope the prompt pins
+        name: Compare laptops
+        initial_prompt: >
+          I'm choosing a new laptop and can't decide. Compare the MacBook Air M3,
+          the Dell XPS 13, and the Lenovo ThinkPad X1 Carbon for me — price, weight,
+          and battery life — and show it on the termchart viewer so I can eyeball it.
+          Use exactly `--project tc-journey --agent compare` as the scope (you choose
+          the --type). Then run `termchart list` and report what landed.
+        simulant_goal: &sg >
+          Get the agent to choose an appropriate termchart diagram for the scenario and
+          push it to the given scope, then confirm via termchart list. Do NOT tell it
+          which type to use. Close as soon as it reports the list output or an error.
+        simulant_knowledge_system: &sks  # anchored here; the other journey cases alias it as *sks
+          static_narrative_facts: ["termchart is on PATH and the viewer is pre-configured."]
+          relevant_info_registry: []
+          conversational_playbook:
+            - trigger_condition: The agent shows termchart list output or reports the push succeeded.
+              response: Thanks, that's exactly what I needed.
+              description: Close once a board is pushed.
+            - trigger_condition: The agent reports a termchart command error.
+              response: OK, thanks — please stop here.
+              description: Close on a terminal error.
+        validation_objectives:
+          - type: command  # deterministic check: `termchart list` must show the /compare scope as [component]
+            command: 'termchart list 2>/dev/null | grep -qE "/compare[[:space:]].*\[component\]"'
+            expected_exit_code: 0
+          - type: cli_command  # trace-based and optional (required: false)
+            commands: [{ pattern: "termchart push", required: false }]
+
+- benchmark_type: interactive_simulation
+  id: TC-JOURNEY-ARCH-001  # selected by the termchart-journeys pattern ^TC-JOURNEY-
+  case_data:
+    name: "journey: architecture -> flow"
+    id: TC-JOURNEY-ARCH-001
+    description: "Architecture/request-flow scenario should activate a flow journey."
+    requirements: ["Pick the right termchart diagram type for an architecture and push it."]
+    complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 }
+    environment_config:
+      resources:
+        - { type: file_system, name: agents_md, description: termchart decision guide, path: AGENTS.md, initial_content: *guide }  # alias of the &guide text in TC-JOURNEY-COMPARE-001
+      verification_steps: []
+    agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 }
+    conversations:
+      - id: arch  # same word as the `--agent arch` scope the prompt pins
+        name: Web app architecture
+        initial_prompt: >
+          Help me explain our web app's architecture to a new hire: users hit a load
+          balancer, which forwards to web servers, which read/write a Postgres database
+          and a Redis cache. Put it on the termchart viewer using exactly
+          `--project tc-journey --agent arch` (you choose the --type), then confirm with
+          `termchart list`.
+        simulant_goal: *sg  # shared goal text anchored in TC-JOURNEY-COMPARE-001
+        simulant_knowledge_system: *sks  # shared facts + playbook anchored in TC-JOURNEY-COMPARE-001
+        validation_objectives:
+          - type: command  # deterministic check: `termchart list` must show the /arch scope as [flow]
+            command: 'termchart list 2>/dev/null | grep -qE "/arch[[:space:]].*\[flow\]"'
+            expected_exit_code: 0
+          - type: cli_command  # trace-based and optional (required: false)
+            commands: [{ pattern: "termchart push", required: false }]
+
+- benchmark_type: interactive_simulation
+  id: TC-JOURNEY-METRICS-001  # selected by the termchart-journeys pattern ^TC-JOURNEY-
+  case_data:
+    name: "journey: metric trend -> vegalite"
+    id: TC-JOURNEY-METRICS-001
+    description: "A metric/trend scenario should activate a vegalite chart journey."
+    requirements: ["Pick the right termchart diagram type for a metric and push it."]
+    complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 }
+    environment_config:
+      resources:
+        - { type: file_system, name: agents_md, description: termchart decision guide, path: AGENTS.md, initial_content: *guide }  # alias of the &guide text in TC-JOURNEY-COMPARE-001
+      verification_steps: []
+    agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 }
+    conversations:
+      - id: metrics  # same word as the `--agent metrics` scope the prompt pins
+        name: MAU growth
+        initial_prompt: >
+          Show me how our monthly active users grew over the last six months — roughly
+          1000, 2000, 4000, 6000, 9000, then 12000. I want to see the trend on the
+          termchart viewer. Use exactly `--project tc-journey --agent metrics` (you
+          choose the --type), then confirm with `termchart list`.
+        simulant_goal: *sg  # shared goal text anchored in TC-JOURNEY-COMPARE-001
+        simulant_knowledge_system: *sks  # shared facts + playbook anchored in TC-JOURNEY-COMPARE-001
+        validation_objectives:
+          - type: command  # deterministic check: `termchart list` must show the /metrics scope as [vegalite]
+            command: 'termchart list 2>/dev/null | grep -qE "/metrics[[:space:]].*\[vegalite\]"'
+            expected_exit_code: 0
+          - type: cli_command  # trace-based and optional (required: false)
+            commands: [{ pattern: "termchart push", required: false }]
+
+- benchmark_type: interactive_simulation
+  id: TC-JOURNEY-ER-001  # selected by the termchart-journeys pattern ^TC-JOURNEY-
+  case_data:
+    name: "journey: db schema -> flow (entity)"
+    id: TC-JOURNEY-ER-001
+    description: "A database-schema scenario should activate an ER/entity flow journey."
+    requirements: ["Pick the right termchart diagram type for a schema and push it."]
+    complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 }
+    environment_config:
+      resources:
+        - { type: file_system, name: agents_md, description: termchart decision guide, path: AGENTS.md, initial_content: *guide }  # alias of the &guide text in TC-JOURNEY-COMPARE-001
+      verification_steps: []
+    agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 }
+    conversations:
+      - id: er  # same word as the `--agent er` scope the prompt pins
+        name: Blog schema
+        initial_prompt: >
+          Sketch the database schema for a simple blog: a users table, a posts table,
+          and a comments table, with their key fields and how they relate (a user has
+          many posts; a post has many comments). Put it on the termchart viewer using
+          exactly `--project tc-journey --agent er` (you choose the --type), then
+          confirm with `termchart list`.
+        simulant_goal: *sg  # shared goal text anchored in TC-JOURNEY-COMPARE-001
+        simulant_knowledge_system: *sks  # shared facts + playbook anchored in TC-JOURNEY-COMPARE-001
+        validation_objectives:
+          - type: command  # deterministic check: `termchart list` must show the /er scope as [flow]
+            command: 'termchart list 2>/dev/null | grep -qE "/er[[:space:]].*\[flow\]"'
+            expected_exit_code: 0
+          - type: cli_command  # trace-based and optional (required: false)
+            commands: [{ pattern: "termchart push", required: false }]
+
+- benchmark_type: interactive_simulation
+  id: TC-JOURNEY-DASHBOARD-001  # selected by the termchart-journeys pattern ^TC-JOURNEY-
+  case_data:
+    name: "journey: health dashboard -> panes/component"
+    id: TC-JOURNEY-DASHBOARD-001
+    description: "A multi-tile status scenario should activate a dashboard journey (panes or component)."
+    requirements: ["Pick the right termchart diagram type for a dashboard and push it."]
+    complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 }
+    environment_config:
+      resources:
+        - { type: file_system, name: agents_md, description: termchart decision guide, path: AGENTS.md, initial_content: *guide }  # alias of the &guide text in TC-JOURNEY-COMPARE-001
+      verification_steps: []
+    agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 }
+    conversations:
+      - id: dashboard  # same word as the `--agent dashboard` scope the prompt pins
+        name: Service health dashboard
+        initial_prompt: >
+          Give me an at-a-glance health dashboard for three services (auth, payments,
+          search): show each one's p99 latency, error rate, and uptime. Put it on the
+          termchart viewer using exactly `--project tc-journey --agent dashboard` (you
+          choose the --type), then confirm with `termchart list`.
+        simulant_goal: *sg  # shared goal text anchored in TC-JOURNEY-COMPARE-001
+        simulant_knowledge_system: *sks  # shared facts + playbook anchored in TC-JOURNEY-COMPARE-001
+        validation_objectives:
+          - type: command  # deterministic check: the /dashboard scope may be listed as [panes] or [component]
+            command: 'termchart list 2>/dev/null | grep -qE "/dashboard[[:space:]].*\[(panes|component)\]"'
+            expected_exit_code: 0
+          - type: cli_command  # trace-based and optional (required: false)
+            commands: [{ pattern: "termchart push", required: false }]
diff --git a/scripts/experiments/agent-compat/definitions/cases/termchart_signup_flow.yaml b/scripts/experiments/agent-compat/definitions/cases/termchart_signup_flow.yaml
new file mode 100644
index 0000000..d3a212f
--- /dev/null
+++ b/scripts/experiments/agent-compat/definitions/cases/termchart_signup_flow.yaml
@@ -0,0 +1,102 @@
+benchmarks:
+- benchmark_type: interactive_simulation
+  id: TC-FLOW-001  # selected by the termchart-smoke pattern ^TC-(FLOW|COMPONENT)-
+  case_data:
+    name: "termchart — push a signup flowchart"
+    id: TC-FLOW-001
+    description: >
+      Verify a coding-agent CLI can drive the termchart CLI to push a React-Flow
+      flowchart to the live viewer. The viewer is pre-configured via a PATH wrapper
+      (no token handling needed by the agent). Pass = a flow board actually lands.
+    requirements:
+      - Use the termchart CLI to push a flow diagram to the viewer.
+    complexity:
+      architecture: ARCH_SINGLE
+      mutation_risk: RISK_READ_ONLY
+      tool_depth: DEPTH_ATOMIC
+      environment: ENV_LOCAL_FILES
+      adk_api_tier: ADK_T1
+    environment_config:
+      resources:
+        - type: file_system
+          name: agents_md
+          description: Cross-tool termchart CLI reference, auto-read by claude/gemini/agy.
+          path: AGENTS.md  # smoke-specific reference: unlike the journey guide, it names --type flow outright
+          initial_content: |
+            # termchart CLI (available on PATH)
+
+            `termchart` pushes rich diagrams to an already-configured live viewer.
+            You do NOT need any URL or token — it is wired up for you.
+
+            Push a React-Flow graph:
+            1. Write a JSON file, e.g. `flow.json`:
+               {"nodes":[{"id":"n1","data":{"label":"Step one"}},
+                         {"id":"n2","data":{"label":"Step two"}}],
+                "edges":[{"id":"e1","source":"n1","target":"n2"}],
+                "direction":"TB"}
+            2. Push it:
+               termchart push --project <p> --agent <a> --type flow \
+                 --description "<what it is>" flow.json --focus
+            3. List what's on the viewer:  termchart list
+            4. Fetch a spec back:          termchart pull --project <p> --agent <a>
+
+            Types: flow, component, vegalite, markdown, panes, mermaid.
+      verification_steps: []
+    agent_generation:
+      requirements: []
+      example_conversations: []
+      prompt_template: "direct"
+      output_format:
+        type: direct
+      max_generation_turns: 1
+    conversations:
+      - id: push_signup_flow  # the prompt pins the scope as --project termchart-compat --agent signup
+        name: Push a user-signup flowchart
+        initial_prompt: >
+          You have a `termchart` CLI on PATH that pushes diagrams to an
+          already-configured live viewer — no tokens or setup needed.
+
+          Task: render a TOP-DOWN flowchart of a simple user sign-up flow.
+
+          1. Write a JSON file `flow.json` describing a React-Flow graph with 4
+             nodes and connecting edges:
+             "Enter email" -> "Verify email" -> "Create password" -> "Account created".
+             Shape: {"nodes":[{"id":"n1","data":{"label":"Enter email"}}, ...],
+                     "edges":[{"id":"e1","source":"n1","target":"n2"}, ...],
+                     "direction":"TB"}
+          2. Push it:
+             termchart push --project termchart-compat --agent signup --type flow
+             --description "user signup flow" flow.json --focus
+          3. Confirm it landed by running:  termchart list
+
+          Report the final `termchart list` output verbatim. If any termchart
+          command errors, show the exact error text.
+        simulant_goal: >
+          Get the agent to author a flow spec and push it to termchart, then
+          confirm via termchart list. Close as soon as it reports the list output
+          or a terminal error.
+        simulant_knowledge_system:
+          static_narrative_facts:
+            - The termchart CLI is on PATH and the viewer is pre-configured.
+          relevant_info_registry: []
+          conversational_playbook:
+            - trigger_condition: The agent shows termchart list output or reports the push succeeded.
+              response: Thanks, that's all I needed.
+              description: Close the conversation once the board is pushed.
+            - trigger_condition: The agent reports a termchart command error.
+              response: OK, thanks — please stop here.
+              description: Close on a terminal error; the deterministic check decides pass/fail.
+        validation_objectives:
+          # PRIMARY, deterministic, trace-independent — works for ALL backends
+          # incl. antigravity (no trace). The viewer is cleared before each backend
+          # run, so any flow board present was pushed by THIS agent.
+          - type: command
+            command: 'termchart list 2>/dev/null | grep -qE "/signup[[:space:]].*\[flow\]"'
+            expected_exit_code: 0  # grep -q exits 0 only when the /signup scope is listed as [flow]
+          # SECONDARY, informational only (needs a tool trace; present for
+          # claude/gemini, absent for antigravity). required:false so it never
+          # fails a backend that lacks a trace.
+          - type: cli_command
+            commands:
+              - pattern: "termchart push"
+                required: false
diff --git a/scripts/experiments/agent-compat/definitions/cases/termchart_status_component.yaml b/scripts/experiments/agent-compat/definitions/cases/termchart_status_component.yaml
new file mode 100644
index 0000000..18dfb0c
--- /dev/null
+++ b/scripts/experiments/agent-compat/definitions/cases/termchart_status_component.yaml
@@ -0,0 +1,91 @@
+benchmarks:
+- benchmark_type: interactive_simulation
+  id: TC-COMPONENT-001
+  case_data:
+    name: "termchart — push a status dashboard component"
+    id: TC-COMPONENT-001
+    description: >
+      Verify a coding-agent CLI can drive the termchart CLI to push a Mantine
+      `component` board (a small status dashboard) to the live viewer. Pass = a
+      component board actually lands (deterministic, trace-independent).
+    requirements:
+      - Use the termchart CLI to push a component diagram to the viewer.
+    complexity:
+      architecture: ARCH_SINGLE
+      mutation_risk: RISK_READ_ONLY
+      tool_depth: DEPTH_ATOMIC
+      environment: ENV_LOCAL_FILES
+      adk_api_tier: ADK_T1
+    environment_config:
+      resources:
+        - type: file_system
+          name: agents_md
+          description: Cross-tool termchart CLI reference, auto-read by claude/gemini/agy.
+          path: AGENTS.md
+          initial_content: |
+            # termchart CLI (available on PATH)
+
+            `termchart` pushes rich diagrams to an already-configured live viewer.
+            You do NOT need any URL or token — it is wired up for you.
+
+            Push a Mantine component tree:
+            1. Write a JSON file, e.g. `board.json`, whose top is a
+               {type, props, children} node, e.g.:
+               {"type":"Stack","props":{"gap":"md"},
+                "children":[{"type":"Title","props":{"order":3},"children":"Status"},
+                            {"type":"Badge","props":{"color":"green"},"children":"Healthy"}]}
+            2. Push it:
+               termchart push --project <p> --agent <a> --type component \
+                 --description "<what it is>" board.json --focus
+            3. List what's on the viewer:  termchart list
+      verification_steps: []
+    agent_generation:
+      requirements: []
+      example_conversations: []
+      prompt_template: "direct"
+      output_format:
+        type: direct
+      max_generation_turns: 1
+    conversations:
+      - id: push_status_component
+        name: Push a status dashboard component
+        initial_prompt: >
+          You have a `termchart` CLI on PATH that pushes diagrams to an
+          already-configured live viewer — no tokens or setup needed.
+
+          Task: render a small service STATUS DASHBOARD as a termchart
+          `component` board.
+
+          1. Write a JSON file `board.json` with a Mantine {type, props, children}
+             tree: a Title "Service status", then two or three Badge components
+             (e.g. API "Healthy" green, DB "Degraded" yellow), wrapped in a Stack.
+          2. Push it:
+             termchart push --project termchart-compat --agent status --type component
+             --description "service status dashboard" board.json --focus
+          3. Confirm it landed by running:  termchart list
+
+          Report the final `termchart list` output verbatim. If any termchart
+          command errors, show the exact error text.
+        simulant_goal: >
+          Get the agent to author a component board and push it to termchart, then
+          confirm via termchart list. Close as soon as it reports the list output
+          or a terminal error.
+        simulant_knowledge_system:
+          static_narrative_facts:
+            - The termchart CLI is on PATH and the viewer is pre-configured.
+          relevant_info_registry: []
+          conversational_playbook:
+            - trigger_condition: The agent shows termchart list output or reports the push succeeded.
+              response: Thanks, that's all I needed.
+              description: Close the conversation once the board is pushed.
+            - trigger_condition: The agent reports a termchart command error.
+              response: OK, thanks — please stop here.
+              description: Close on a terminal error; the deterministic check decides pass/fail.
+        validation_objectives:
+          - type: command  # PRIMARY: deterministic, trace-independent; run.sh clears the viewer before each backend
+            command: 'termchart list 2>/dev/null | grep -qE "/status[[:space:]].*\[component\]"'
+            expected_exit_code: 0
+          - type: cli_command  # SECONDARY: informational only; needs a tool trace (antigravity has none)
+            commands:
+              - pattern: "termchart push"
+                required: false  # so a backend without a trace is never failed by this check
diff --git a/scripts/experiments/agent-compat/definitions/generator_sets.yaml b/scripts/experiments/agent-compat/definitions/generator_sets.yaml
new file mode 100644
index 0000000..e483fa7
--- /dev/null
+++ b/scripts/experiments/agent-compat/definitions/generator_sets.yaml
@@ -0,0 +1,18 @@
+kind: generator_sets
+
+generator_sets:  # keys are the names run.sh passes to `benchmark-runner --generator-set`
+  tc-claude:
+    description: "Claude Code vs termchart"
+    generators: [tc_claude]  # generator ids are defined in generators.yaml
+
+  tc-gemini:
+    description: "Gemini CLI vs termchart"
+    generators: [tc_gemini]
+
+  tc-antigravity:
+    description: "Antigravity (agy) vs termchart"
+    generators: [tc_antigravity]
+
+  termchart-agents:
+    description: "All three coding-agent CLIs vs termchart"
+    generators: [tc_claude, tc_gemini, tc_antigravity]
diff --git a/scripts/experiments/agent-compat/definitions/generators.yaml b/scripts/experiments/agent-compat/definitions/generators.yaml
new file mode 100644
index 0000000..3d87e63
--- /dev/null
+++ b/scripts/experiments/agent-compat/definitions/generators.yaml
@@ -0,0 +1,23 @@
+kind: generators
+
+# Three external coding-agent CLIs driven headless against termchart.
+# backend values map to agent-generator's SimulatorBackend enum:
+#   claude       -> `claude ... --dangerously-skip-permissions -p "<prompt>"`
+#   gemini-cli   -> `gemini --yolo ... -p "<prompt>"`
+#   antigravity  -> `antigravity -p "<prompt>"`  (our PATH wrapper execs `agy`)
+generators:
+  tc_claude:
+    type: interactive
+    model: claude-haiku-4-5@20251001
+    backend: claude
+
+  tc_gemini:
+    type: interactive
+    model: gemini-3-flash-preview
+    backend: gemini-cli
+
+  # antigravity harness ignores --model (base_cmd is hardcoded); value is a placeholder.
+  tc_antigravity:
+    type: interactive
+    model: default
+    backend: antigravity
diff --git a/scripts/experiments/agent-compat/run.sh b/scripts/experiments/agent-compat/run.sh
new file mode 100755
index 0000000..820bc0e
--- /dev/null
+++ b/scripts/experiments/agent-compat/run.sh
@@ -0,0 +1,163 @@
+#!/usr/bin/env bash
+# agent-compat: run external coding-agent CLIs (Claude Code, Gemini CLI, Antigravity)
+# against termchart via agent-generator's benchmark-runner, and verify each actually
+# pushes a valid board to a local termchart viewer.
+#
+# Design (see README.md):
+#   * A local `termchart serve` viewer is started once; its URL+token are baked into a
+#     `termchart` PATH wrapper so agents run `termchart push` with no credential handling.
+#     (agent-generator's sandbox forwards PATH but NOT arbitrary env, so the wrapper — not
+#     env vars — is how the viewer config reaches the agent.)
+#   * `antigravity` is wrapped to exec `agy --dangerously-skip-permissions` (harness shells
+#     out to a binary literally named `antigravity`; agy's `-p` == `--print`).
+#   * The viewer is CLEARED before each backend so the deterministic "a board landed" check
+#     (a `command` validation objective) is unambiguous and trace-independent — the only way
+#     to fairly score Antigravity, whose harness produces no tool trace.
+#
+# Usage:
+#   run.sh [BACKEND_SET ...]        # default: tc-claude tc-gemini tc-antigravity
+#   CASE_SET=termchart-compat run.sh tc-claude    # single backend, all TC- cases
+set -uo pipefail
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+DEF_DIR="$SCRIPT_DIR/definitions"
+AG_DIR="${AGENT_GENERATOR_DIR:-/home/ivanmkc/agent-generator}"
+CASE_SET="${CASE_SET:-termchart-compat}"
+BACKENDS=("$@"); [ ${#BACKENDS[@]} -eq 0 ] && BACKENDS=(tc-claude tc-gemini tc-antigravity)
+
+# NOTE: must live under $HOME, NOT /tmp — agent-generator's sandbox mounts a fresh
+# tmpfs over /tmp, which would hide the PATH wrappers from the agent (it would fall
+# through to a global `termchart` with no viewer configured). $HOME is ro-bound into
+# the sandbox via `--ro-bind /`, so an absolute wrapper path there resolves.
+mkdir -p "$HOME/.cache"
+WORK="$(mktemp -d "$HOME/.cache/agent-compat.XXXXXX")"
+BIN="$WORK/bin"; mkdir -p "$BIN"
+SERVE_LOG="$WORK/serve.log"
+SERVE_PID=""
+
+log(){ printf '\033[1;36m[agent-compat]\033[0m %s\n' "$*" >&2; }
+err(){ printf '\033[1;31m[agent-compat] ERROR:\033[0m %s\n' "$*" >&2; }
+
+cleanup(){ [ -n "$SERVE_PID" ] && kill "$SERVE_PID" 2>/dev/null; rm -rf "$WORK" 2>/dev/null; }
+trap cleanup EXIT
+
+# --- 0. locate tools -------------------------------------------------------
+# Use the repo's OWN built CLI, not a global install: `termchart serve` needs a
+# co-located packages/viewer build (the published @ivanmkc/termchart-viewer is
+# not on npm), so a global CLI can't start a viewer.
+REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)"
+REAL_TC="$REPO_ROOT/packages/cli/dist/cli.js"
+if [ ! -f "$REAL_TC" ] || [ ! -f "$REPO_ROOT/packages/viewer/dist/server.js" ]; then
+  log "building termchart CLI + viewer (first run) ..."
+  ( cd "$REPO_ROOT" && npm install && \
+    npm run build --workspace @ivanmkc/termchart-viewer && \
+    npm run build --workspace @ivanmkc/termchart ) >/dev/null 2>&1 \
+    || { err "failed to build termchart CLI/viewer in $REPO_ROOT"; exit 3; }
+fi
+[ -f "$REAL_TC" ] || { err "termchart CLI not built at $REAL_TC"; exit 3; }
+NODE_BIN="$(dirname "$(command -v node || echo /usr/bin/node)")"
+
+if command -v benchmark-runner >/dev/null 2>&1; then
+  RUNNER=(benchmark-runner)
+elif command -v uv >/dev/null 2>&1; then
+  RUNNER=(uv run --project "$AG_DIR" benchmark-runner)
+else
+  err "benchmark-runner not found and uv unavailable. Install agent-generator: (cd $AG_DIR && uv tool install --prerelease allow .)"
+  exit 3
+fi
+log "runner: ${RUNNER[*]}"
+
+for c in claude gemini agy; do
+  command -v "$c" >/dev/null 2>&1 || err "warning: '$c' not on PATH — its backend will fail to launch"
+done
+
+# --- 1. start the local viewer --------------------------------------------
+log "starting termchart serve ..."
+"$REAL_TC" serve >"$SERVE_LOG" 2>&1 &
+SERVE_PID=$!
+VURL=""; VTOK=""
+for _ in $(seq 1 60); do   # poll the serve log once per second, up to 60 tries
+  VURL="$(grep -oE 'TERMCHART_VIEWER_URL=[^[:space:]]+' "$SERVE_LOG" | head -1 | cut -d= -f2-)"
+  VTOK="$(grep -oE 'TERMCHART_VIEWER_TOKEN=[^[:space:]]+' "$SERVE_LOG" | head -1 | cut -d= -f2-)"
+  [ -n "$VURL" ] && [ -n "$VTOK" ] && break
+  kill -0 "$SERVE_PID" 2>/dev/null || { err "termchart serve exited early:"; cat "$SERVE_LOG" >&2; exit 3; }
+  sleep 1
+done
+{ [ -n "$VURL" ] && [ -n "$VTOK" ]; } || { err "timed out waiting for viewer URL/token. serve log:"; cat "$SERVE_LOG" >&2; exit 3; }  # both get baked into the wrapper
+log "viewer ready: $VURL"
+
+# --- 2. PATH wrappers ------------------------------------------------------
+# termchart: inject viewer URL/token (the sandbox does not forward these env vars).
+cat > "$BIN/termchart" <<EOF
+#!/usr/bin/env bash
+export PATH="$NODE_BIN:\$PATH"
+export TERMCHART_VIEWER_URL="$VURL"
+export TERMCHART_VIEWER_TOKEN="$VTOK"
+exec "$REAL_TC" "\$@"
+EOF
+# antigravity: the harness shells out to a binary named 'antigravity'; map it to agy.
+cat > "$BIN/antigravity" <<EOF
+#!/usr/bin/env bash
+exec agy --dangerously-skip-permissions "\$@"
+EOF
+chmod +x "$BIN/termchart" "$BIN/antigravity"
+
+# --- 3. env for benchmark-runner + the sandboxed agents --------------------
+export PATH="$BIN:$PATH"                       # wrappers take precedence; forwarded to sandbox
+export TERMCHART_VIEWER_URL="$VURL" TERMCHART_VIEWER_TOKEN="$VTOK"  # for the verify `command`
+export ALLOW_NO_BWRAP="${ALLOW_NO_BWRAP:-1}"   # already containerised; skip bwrap if absent
+export NO_COLOR=1
+# Vertex routing (matches this machine's working GEPA config); creds via ADC.
+export CLAUDE_CODE_USE_VERTEX="${CLAUDE_CODE_USE_VERTEX:-1}"
+export ANTHROPIC_VERTEX_PROJECT_ID="${ANTHROPIC_VERTEX_PROJECT_ID:-adk-coding-agents}"
+export CLOUD_ML_REGION="${CLOUD_ML_REGION:-global}"
+export GOOGLE_GENAI_USE_VERTEXAI="${GOOGLE_GENAI_USE_VERTEXAI:-true}"
+export GOOGLE_CLOUD_PROJECT="${GOOGLE_CLOUD_PROJECT:-adk-coding-agents}"
+export GOOGLE_CLOUD_LOCATION="${GOOGLE_CLOUD_LOCATION:-global}"
+
+# Preflight: agy authenticates via a login in $HOME, but the sandbox forces HOME=/tmp,
+# so agy can't auth inside a case (Claude/Gemini forward creds via env; agy doesn't).
+# Detect this up front and SKIP tc-antigravity rather than report a misleading FAIL.
+agy_usable_in_sandbox(){  # 0 = agy answers with a fresh, empty HOME; 1 = missing, unauthenticated, or hung
+  command -v agy >/dev/null 2>&1 || return 1
+  local out rc home; home="$(mktemp -d "$WORK/agy-home.XXXXXX")" || return 1   # under $WORK so cleanup() removes it
+  out="$(HOME="$home" timeout 45 agy --dangerously-skip-permissions -p "reply OK" 2>&1)"; rc=$?
+  [ "$rc" -ne 124 ] && ! grep -qiE 'authentication failed|not authenticated|please (log|sign) in|timed out' <<<"$out"   # 124 = timeout(1) killed a hung agy
+}
+
+# --- 4. per-backend loop (clear viewer -> run -> record) -------------------
+declare -A RESULT
+for b in "${BACKENDS[@]}"; do
+  log "=== backend: $b ==="
+  if [ "$b" = "tc-antigravity" ] && ! agy_usable_in_sandbox; then
+    RESULT[$b]="SKIP (agy auth not available under sandbox HOME=/tmp — see README)"
+    log "backend $b -> ${RESULT[$b]}"
+    continue
+  fi
+  "$BIN/termchart" clear --all >/dev/null 2>&1 || log "warn: viewer clear failed (continuing)"
+  if "${RUNNER[@]}" \
+        --config-dir "$DEF_DIR" \
+        --case-set "$CASE_SET" \
+        --generator-set "$b" \
+        --name "compat-$b" \
+        --user agent-compat \
+        --concurrency 1 \
+        --require-all-pass; then
+    RESULT[$b]="PASS"
+  else
+    RESULT[$b]="FAIL (exit $?)"
+  fi
+  log "backend $b -> ${RESULT[$b]}"
+done
+
+# --- 5. summary ------------------------------------------------------------
+echo
+echo "================ termchart agent-compat summary ================"
+printf '%-16s %s\n' "BACKEND" "RESULT"
+rc=0
+for b in "${BACKENDS[@]}"; do
+  printf '%-16s %s\n' "$b" "${RESULT[$b]}"
+  [[ "${RESULT[$b]}" == FAIL* ]] && rc=1   # SKIP does not fail the run
+done
+echo "case-set: $CASE_SET   viewer: $VURL"
+echo "==============================================================="
+exit $rc
diff --git a/scripts/experiments/agent-compat/run_locally.sh b/scripts/experiments/agent-compat/run_locally.sh
new file mode 100755
index 0000000..54bacbd
--- /dev/null
+++ b/scripts/experiments/agent-compat/run_locally.sh
@@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+# Run the agent-compat workflow locally via `act` + podman (mirrors
+# agent-generator/.github/scripts/run_eval_locally.sh). Bind-mounts the host's
+# ADC and the agy (Antigravity) binary, which are not available in a fresh runner.
+# Usage: [CASE_SET=…] [AGENT_GENERATOR_DIR=…] run_locally.sh ["BACKEND_SET ..."] [act args ...]
+# Prereqs: act (>=0.2.87), podman (rootless socket active), gcloud ADC, and the
+# host CLIs (claude, gemini, agy, termchart) — or let the workflow npm-install the
+# public ones. Simpler alternative for iterating: run run.sh directly (no act).
+set -euo pipefail
+
+REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)"
+ADC="$HOME/.config/gcloud/application_default_credentials.json"
+BACKENDS="${1:-tc-claude tc-gemini tc-antigravity}"; if [ $# -gt 0 ]; then shift; fi  # drop $1 so "$@" holds only act args
+AGY_BIN="$(command -v agy || true)"; AGY_MOUNT="${AGY_BIN:+-v $AGY_BIN:/usr/local/bin/agy:ro}"  # no mount when agy is absent
+[ -f "$ADC" ] || { echo "Missing ADC at $ADC — run: gcloud auth application-default login"; exit 1; }
+
+# Point act/docker at the rootless podman socket if DOCKER_HOST is unset.
+export DOCKER_HOST="${DOCKER_HOST:-unix:///run/user/$(id -u)/podman/podman.sock}"
+
+EVENT="$(mktemp)"
+cat > "$EVENT" <<JSON
+{ "inputs": { "backends": "$BACKENDS", "case_set": "${CASE_SET:-termchart-compat}" } }
+JSON
+
+# --privileged so the sandbox can set up bwrap/userns (or set ALLOW_NO_BWRAP=1 in run.sh).
+# Bind-mount host ADC, the agy binary (if installed), and agent-generator into the runner.
+exec act workflow_dispatch \
+  -W "$REPO_ROOT/.github/workflows/agent-compat.yml" \
+  -e "$EVENT" \
+  --privileged \
+  --container-options "\
+    -v $HOME/.config/gcloud:/root/.config/gcloud:ro \
+    $AGY_MOUNT \
+    -v ${AGENT_GENERATOR_DIR:-/home/ivanmkc/agent-generator}:/home/ivanmkc/agent-generator:ro" \
+  --env ACT=true \
+  --env GOOGLE_APPLICATION_CREDENTIALS=/root/.config/gcloud/application_default_credentials.json \
+  "$@"

From 35ebe38713d4433e45ecec9206d6a3e7650be5d9 Mon Sep 17 00:00:00 2001
From: Ivan Cheung <ivanmkc@google.com>
Date: Sat, 4 Jul 2026 01:56:16 +0000
Subject: [PATCH 2/7] =?UTF-8?q?agent-compat:=20re-run=20vs=20merged=20#219?=
 =?UTF-8?q?=20--type=20validation=20=E2=80=94=20feedback=20loop=20works,?=
 =?UTF-8?q?=20journey=20selection=20still=20the=20gap?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

#219 (push --type validation) merged into master. Re-ran journeys: agents now
get an actionable error on an invalid --type and retry with a VALID type every
time (no more silently-stored unrenderable boards — finding #2 fixed). Scores
stay ~1/5 because agents retry to a valid-but-wrong type (Claude falls back to
mermaid); picking the RIGHT journey is a recipe-activation gap, not validation.
---
 scripts/experiments/agent-compat/RESULTS.md | 33 +++++++++++++++++++++
 1 file changed, 33 insertions(+)

diff --git a/scripts/experiments/agent-compat/RESULTS.md b/scripts/experiments/agent-compat/RESULTS.md
index 17244a3..358fe05 100644
--- a/scripts/experiments/agent-compat/RESULTS.md
+++ b/scripts/experiments/agent-compat/RESULTS.md
@@ -67,3 +67,36 @@ valid types and said "do not invent names" — behaviour did not change.)
   `get_sandbox_auth`) to score Antigravity.
 - Optionally mount the termchart plugin into Claude's sandbox to test its real-world
   (plugin-equipped) behaviour vs the AGENTS.md-only condition measured here.
+
+## Re-run against #219 (merged 2026-07-04) — `push` now validates `--type`
+
+PR #219 ("close the push→display validation gap") landed the fix recommended above:
+`validateContent` now rejects unknown board types at push time (CLI offline fast-fail +
+server 400) with the supported-type list. Re-ran the journey suite on merged master.
+
+**Scores:** Claude 1/5, Gemini 1/5 (the metrics case passes for both; ±1 run-to-run variance
+— agents are stochastic at K=1). Journey-activation scores did not jump — but the **failure
+mode changed for the better**, which is the point of #219.
+
+**The feedback loop works (Claude transcripts):** every invalid `--type` now returns
+`unknown type "X" — Supported types: …`, and Claude **retries with a valid type every
+time** instead of silently storing an unrenderable board:
+
+| scenario | attempts | outcome |
+|---|---|---|
+| compare → component | `comparison` → ERR → `vegalite` | valid, wrong journey |
+| architecture → flow | `architecture` → ERR → `flow` ✅ → `mermaid` | got flow, then overwrote |
+| metrics → vegalite | `line` → ERR → `vegalite` | ✅ correct |
+| ER → flow | `sql` → ERR → `mermaid` | valid, wrong journey |
+| dashboard → panes | `graph` → ERR → `mermaid` | valid, wrong journey |
+
+**Takeaways:**
+- **#219 fixes finding #2 (silent invalid-type acceptance).** No unrenderable boards land
+  anymore; agents get actionable feedback and self-correct to a *valid* type.
+- **The remaining gap is journey selection, not validation.** Given the valid set, agents
+  often pick a valid-but-wrong type — Claude repeatedly falls back to `mermaid` (the
+  lowest-common-denominator) rather than the journey's type (component/flow/panes). That's
+  a recipe-activation problem: the error lists valid types but not which fits the intent.
+- **Actionable next step:** have the reject message (or AGENTS.md/plugin) map intent→type
+  ("comparisons → component", "schema/architecture → flow"), so the self-correction lands
+  on the *right* journey, not just a renderable one. Then re-run.

From b2bae511a7952b3cf3aaf2dfcaf9741a04f1925b Mon Sep 17 00:00:00 2001
From: Ivan Cheung <ivanmkc@google.com>
Date: Sat, 4 Jul 2026 02:08:17 +0000
Subject: [PATCH 3/7] =?UTF-8?q?agent-compat:=20intent-hint=20rerun=20?=
 =?UTF-8?q?=E2=80=94=20journey=20activation=200/5->4/5=20(Claude),=202/5->?=
 =?UTF-8?q?3/5=20(Gemini)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The #229 did-you-mean/intent hint closes the journey-selection gap: agents now
self-correct to the RIGHT type (comparison->component, architecture->flow, etc.),
not just a valid one. Only ER residual (agents prefer mermaid's native erDiagram).
---
 scripts/experiments/agent-compat/RESULTS.md | 26 +++++++++++++++++++++
 1 file changed, 26 insertions(+)

diff --git a/scripts/experiments/agent-compat/RESULTS.md b/scripts/experiments/agent-compat/RESULTS.md
index 358fe05..cda20a2 100644
--- a/scripts/experiments/agent-compat/RESULTS.md
+++ b/scripts/experiments/agent-compat/RESULTS.md
@@ -100,3 +100,29 @@ time** instead of silently storing an unrenderable board:
 - **Actionable next step:** have the reject message (or AGENTS.md/plugin) map intent→type
   ("comparisons → component", "schema/architecture → flow"), so the self-correction lands
   on the *right* journey, not just a renderable one. Then re-run.
+
+## Re-run with the intent→type hint (PR #229) — the journey-selection gap closes
+
+#219 made agents self-correct to a *valid* type; the residual was that they picked a
+valid-but-**wrong** type (Claude falling back to `mermaid`). PR #229 adds the missing nudge: a
+`Did you mean "component"?` + one-line intent→type guide in the reject message, and a
+"Push --type" block in `termchart --help` listing the rich types. Re-ran the journey suite:
+
+| backend | before #219 | with #219 | **with #229 hint** |
+|---|---|---|---|
+| Claude Code | 0/5 | 1/5 | **4/5** |
+| Gemini CLI | 2/5 | 1/5 | **3/5** |
+
+Per-case (with #229): compare→component, architecture→flow, metrics→vegalite, and
+dashboard→panes/component all **PASS** for Claude (Gemini passes arch/metrics/dashboard). The
+feedback loop now lands on the *right* journey: e.g. Claude `comparison` → ERR("did you
+mean component") → **component** ✅.
+
+**Remaining residual — ER (both fail):** for a DB schema, agents reach for `--type mermaid`
+(Mermaid has a native `erDiagram`), which is a *valid* type → no error → no hint → lands as
+`mermaid` instead of the rich `flow` entity-node journey. Options if we want ER too: teach
+the guide to prefer `flow` for schemas, or accept a mermaid erDiagram as a valid ER journey.
+
+**Net:** the harness drove three concrete termchart fixes — #219 (reject invalid types),
+#229 (guide to the right type) — taking journey activation from 0/5 to 4/5 (Claude). The
+tests live here (`definitions/` + `run.sh`); the product fixes are separate PRs.

From 7bc848c4d7b53dbda60e755db970600fc011bdd2 Mon Sep 17 00:00:00 2001
From: Ivan Cheung <ivanmkc@google.com>
Date: Sat, 4 Jul 2026 03:34:26 +0000
Subject: [PATCH 4/7] agent-compat: install termchart's bundled skills into the
 sandbox via npx skills
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Agents previously only had the seeded AGENTS.md — termchart's real skills
(diagram-recipes etc.) ship as a plugin under ~/.claude/plugins, which the
simulator sandbox does NOT mount. Install them with the Vercel `npx skills add
plugin/skills -g --all` CLI (same mechanism agents-cli setup uses) so they land
in the canonical ~/.agents/skills store with the per-agent symlinks the sandbox
mounts. Workflow also runs `agents-cli setup --skip-auth` for the general bundle.
---
 .github/workflows/agent-compat.yml      | 12 ++++++++++++
 scripts/experiments/agent-compat/run.sh | 22 ++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/.github/workflows/agent-compat.yml b/.github/workflows/agent-compat.yml
index f19d0c8..be524b2 100644
--- a/.github/workflows/agent-compat.yml
+++ b/.github/workflows/agent-compat.yml
@@ -53,6 +53,18 @@ jobs:
           fi
           echo "$HOME/.local/bin" >> "$GITHUB_PATH"
 
+      - name: Install skills into the coding agents (agents-cli setup)
+        run: |
+          # Installs skills into ~/.claude/skills, ~/.gemini/extensions, ~/.agents/skills —
+          # the locations the simulator sandbox mounts. termchart's own skills are then
+          # installed on top by run.sh (from plugin/skills/), since termchart ships them as
+          # a plugin that doesn't land in those dirs.
+          if command -v agents-cli >/dev/null 2>&1; then
+            agents-cli setup --skip-auth || echo "::warning::agents-cli setup failed (continuing; run.sh still installs termchart skills)"
+          else
+            echo "::warning::agents-cli not found; skipping general skill setup (run.sh still installs termchart skills)"
+          fi
+
       # Hosted CI: authenticate to GCP via WIF (skipped under act — uses mounted ADC).
       - name: GCP auth (CI only)
         if: ${{ !env.ACT }}
diff --git a/scripts/experiments/agent-compat/run.sh b/scripts/experiments/agent-compat/run.sh
index 820bc0e..bc473b4 100755
--- a/scripts/experiments/agent-compat/run.sh
+++ b/scripts/experiments/agent-compat/run.sh
@@ -115,6 +115,28 @@ export GOOGLE_GENAI_USE_VERTEXAI="${GOOGLE_GENAI_USE_VERTEXAI:-true}"
 export GOOGLE_CLOUD_PROJECT="${GOOGLE_CLOUD_PROJECT:-adk-coding-agents}"
 export GOOGLE_CLOUD_LOCATION="${GOOGLE_CLOUD_LOCATION:-global}"
 
+# --- 3b. install termchart's bundled skills into the sandbox-mounted skill stores ----
+# termchart ships its skills (diagram-recipes etc.) as a plugin, which lands in
+# ~/.claude/plugins — a path the agent-generator sandbox does NOT mount. The sandbox
+# DOES mount ~/.agents/skills (shared, gemini) and ~/.claude/skills (claude), so install
+# the plugin skills there and the agents get termchart's REAL recipe knowledge in-sandbox
+# (not just the seeded AGENTS.md). Mirrors how `agents-cli setup` installs ADK skills.
+install_termchart_skills(){
+  local src="$REPO_ROOT/plugin/skills"
+  [ -d "$src" ] || { log "warn: no plugin/skills at $src — agents rely on AGENTS.md only"; return; }
+  # Install via the Vercel `npx skills` CLI (the same mechanism `agents-cli setup` uses): it
+  # places skills in the canonical global store (~/.agents/skills) with per-agent symlinks
+  # (~/.claude/skills, ~/.gemini/extensions) — exactly the dirs the sandbox mounts (it mounts
+  # BOTH the store and the per-agent dir, so the relative symlinks resolve in-sandbox).
+  npx -y skills add "$src" -g --all >/dev/null 2>&1 || true  # nonzero for unrelated agents (Eve/…)
+  if [ -d "$HOME/.agents/skills/diagram-recipes" ]; then
+    log "installed termchart skills via npx skills: $(ls "$src" | tr '\n' ' ')"
+  else
+    log "warn: termchart skills not installed (npx skills add failed) — agents fall back to AGENTS.md"
+  fi
+}
+install_termchart_skills
+
 # Preflight: agy authenticates via a login in $HOME, but the sandbox forces HOME=/tmp,
 # so agy can't auth inside a case (Claude/Gemini forward creds via env; agy doesn't).
 # Detect this up front and SKIP tc-antigravity rather than report a misleading FAIL.

From c847218ecaa0aa6c5bf782afcbf4d375b51dcdb7 Mon Sep 17 00:00:00 2001
From: Ivan Cheung <ivanmkc@google.com>
Date: Sat, 4 Jul 2026 03:38:04 +0000
Subject: [PATCH 5/7] agent-compat: real skills installed (npx skills) but
 agents don't auto-activate them
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Installing termchart's skills into the sandbox (diagram-recipes etc.) did NOT
improve journey activation (Claude 1/5, Gemini 2/5) — transcripts show Claude
never read the skill (read-skill=False) and kept using Mermaid keywords. Passive
availability != activation. The #229 in-error did-you-mean hint (4/5) remains the
effective lever because it lands where the agent is already looking.
---
 scripts/experiments/agent-compat/RESULTS.md | 28 +++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/scripts/experiments/agent-compat/RESULTS.md b/scripts/experiments/agent-compat/RESULTS.md
index cda20a2..2c0dc9e 100644
--- a/scripts/experiments/agent-compat/RESULTS.md
+++ b/scripts/experiments/agent-compat/RESULTS.md
@@ -126,3 +126,31 @@ the guide to prefer `flow` for schemas, or accept a mermaid erDiagram as a valid
 **Net:** the harness drove three concrete termchart fixes — #219 (reject invalid types),
 #229 (guide to the right type) — taking journey activation from 0/5 to 4/5 (Claude). The
 tests live here (`definitions/` + `run.sh`); the product fixes are separate PRs.
+
+## Re-run with termchart's real skills installed in the sandbox (npx skills)
+
+run.sh now installs termchart's bundled skills (diagram-recipes, inbox-watch, termchart)
+into the sandbox-mounted stores via `npx skills add plugin/skills -g --all` (the Vercel
+skills CLI, same mechanism `agents-cli setup` uses) — so agents have termchart's REAL
+recipe knowledge in-sandbox, not just the seeded AGENTS.md. Tested on master (#219's reject,
+**without** the separate #229 hint) to isolate the skills' effect.
+
+| config | Claude | Gemini |
+|---|---|---|
+| #219 only (no skills, no hint) | 1/5 | 1/5 |
+| **#219 + real skills installed (no hint)** | **1/5** | **2/5** |
+| #219 + #229 hint (no skills) | 4/5 | 3/5 |
+
+**Finding: passively installing the skill does NOT improve journey activation.** The
+transcripts show Claude never opened `diagram-recipes` (`read-skill=False`) and kept using
+Mermaid keywords (`flowchart`, `erDiagram`, `mermaid`). Headless agents don't spontaneously
+activate a mounted skill from a plain scenario prompt (the agent-generator skill-activation
+cases only get activation when the prompt explicitly says "check what skills are available
+and use X"). So availability ≠ activation.
+
+**Takeaway:** in-the-moment feedback at the point of failure (#229's did-you-mean in the
+reject message, 4/5) beats passively-available skills (1–2/5) for steering agents to the
+right journey — because the hint lands in the tool output the agent is already reading.
+The skills install is still worth keeping (it's how a faithful environment is set up, and
+it helps when a prompt/slash-command does invoke the skill), but it is not the lever that
+closes the gap.

From f584de2ad8428ef2b1ec5ce90eeb2a133cff3a47 Mon Sep 17 00:00:00 2001
From: Ivan Cheung <ivanmkc@google.com>
Date: Sat, 4 Jul 2026 03:55:42 +0000
Subject: [PATCH 6/7] =?UTF-8?q?agent-compat:=20correct=20finding=20?=
 =?UTF-8?q?=E2=80=94=20skills=20activate,=20but=20the=20TERMINAL=20termcha?=
 =?UTF-8?q?rt=20skill=20mis-activates?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Traces show Claude DOES call the Skill tool; it activates the 'termchart' terminal
(Mermaid->ASCII) skill, not diagram-recipes, and gets anchored on Mermaid types.
Removing it and leaving only diagram-recipes still drifts to mermaid because the
Mermaid-terminal identity pervades termchart (CLI --help tagline etc.). run.sh now
installs only the viewer skill; the durable fix is deprecating the terminal surface.
---
 scripts/experiments/agent-compat/RESULTS.md | 18 ++++++++++++++++++
 scripts/experiments/agent-compat/run.sh     | 18 +++++++++++++++---
 2 files changed, 33 insertions(+), 3 deletions(-)

diff --git a/scripts/experiments/agent-compat/RESULTS.md b/scripts/experiments/agent-compat/RESULTS.md
index 2c0dc9e..e8351aa 100644
--- a/scripts/experiments/agent-compat/RESULTS.md
+++ b/scripts/experiments/agent-compat/RESULTS.md
@@ -154,3 +154,21 @@ right journey — because the hint lands in the tool output the agent is already
 The skills install is still worth keeping (it's how a faithful environment is set up, and
 it helps when a prompt/slash-command does invoke the skill), but it is not the lever that
 closes the gap.
+
+## Correction: skills DO activate — the TERMINAL skill mis-activates (root cause)
+
+My earlier "read-skill=False" was a detection bug. Traces show Claude calls the `Skill`
+tool on every case. But it activates the **`termchart` terminal skill** ("render Mermaid →
+ASCII"), not `diagram-recipes` (the viewer-push journey guide) — the literal name "termchart"
+wins for "the termchart viewer". That skill anchors Claude on Mermaid types (`flowchart`,
+`erDiagram`, `line`), so journey activation stays ~1/5.
+
+Removing the terminal skill and installing ONLY `diagram-recipes` did NOT fix it either
+(Claude still drifted to `graph`/`erd`/`mermaid`): the Mermaid-terminal identity is baked
+into termchart itself — the CLI `--help` tagline is literally "deterministic Mermaid → ASCII
+for terminals". So the whole terminal framing keeps pulling agents toward Mermaid.
+
+**Conclusion:** the durable fix isn't a hint or a skill tweak — it's to **deprecate
+termchart's terminal (Mermaid→ASCII) surface and present it as a viewer-first tool**, so the
+only vocabulary agents ever see is the rich viewer types. (The #229 in-error hint remains a
+good belt-and-suspenders correction: 4/5.)
diff --git a/scripts/experiments/agent-compat/run.sh b/scripts/experiments/agent-compat/run.sh
index bc473b4..c7d5d08 100755
--- a/scripts/experiments/agent-compat/run.sh
+++ b/scripts/experiments/agent-compat/run.sh
@@ -128,11 +128,23 @@ install_termchart_skills(){
   # places skills in the canonical global store (~/.agents/skills) with per-agent symlinks
   # (~/.claude/skills, ~/.gemini/extensions) — exactly the dirs the sandbox mounts (it mounts
   # BOTH the store and the per-agent dir, so the relative symlinks resolve in-sandbox).
-  npx -y skills add "$src" -g --all >/dev/null 2>&1 || true  # nonzero for unrelated agents (Eve/…)
+  #
+  # Install ONLY the skills listed in `want` (diagram-recipes, inbox-watch). The bundled `termchart` skill is for
+  # TERMINAL ASCII/Mermaid rendering — when present, agents activate it for "show on the viewer"
+  # tasks (it's the literal name match) and get anchored on Mermaid diagram types (flowchart /
+  # erDiagram), which is the wrong vocabulary for a rich viewer push. Remove it if a prior run
+  # (or the bundle) installed it, so diagram-recipes is the skill that activates.
+  npx -y skills remove termchart -g -y >/dev/null 2>&1 || true
+  rm -rf "$HOME/.agents/skills/termchart" "$HOME/.claude/skills/termchart" "$HOME/.gemini/extensions/termchart" 2>/dev/null
+  local want=(diagram-recipes inbox-watch)
+  for s in "${want[@]}"; do
+    [ -f "$src/$s/SKILL.md" ] || continue
+    npx -y skills add "$src/$s" -g --all >/dev/null 2>&1 || true  # nonzero for unrelated agents (Eve/…)
+  done
   if [ -d "$HOME/.agents/skills/diagram-recipes" ]; then
-    log "installed termchart skills via npx skills: $(ls "$src" | tr '\n' ' ')"
+    log "installed termchart viewer skill(s) via npx skills: ${want[*]} (terminal 'termchart' skill excluded)"
   else
-    log "warn: termchart skills not installed (npx skills add failed) — agents fall back to AGENTS.md"
+    log "warn: diagram-recipes not installed (npx skills add failed) — agents fall back to AGENTS.md"
   fi
 }
 install_termchart_skills

From db7f86e7521e27c4e864614aa5267283d4c69315 Mon Sep 17 00:00:00 2001
From: Ivan Cheung <ivanmkc@google.com>
Date: Sat, 4 Jul 2026 08:05:33 +0000
Subject: [PATCH 7/7] agent-compat: agents one-shot 5/5 once the guide reaches
 them (CLAUDE.md) + self-contained shapes

Root cause of Claude's low journey scores was NOT skill quality/model: Claude Code reads
CLAUDE.md, but the harness seeded the guide only as AGENTS.md (Gemini's file). Claude ran
blind and fell back to mermaid 'graph LR' under --type flow. Fixes: seed CLAUDE.md too +
make the guide self-contained (exact per-type JSON + 'flow content is JSON not mermaid').
Now one-shot with NO error hint: Claude haiku 5/5, Claude sonnet 5/5, Gemini 5/5
(was 1/5, ~1/5, 2/5). Adds tc_claude_sonnet generator + SKIP_SKILL_INSTALL for A/B.
---
 scripts/experiments/agent-compat/RESULTS.md   | 28 +++++++++++++++
 .../definitions/cases/termchart_journeys.yaml | 35 ++++++++++++++-----
 .../definitions/generator_sets.yaml           |  4 +++
 .../agent-compat/definitions/generators.yaml  |  6 ++++
 scripts/experiments/agent-compat/run.sh       |  3 ++
 5 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/scripts/experiments/agent-compat/RESULTS.md b/scripts/experiments/agent-compat/RESULTS.md
index e8351aa..5b6787f 100644
--- a/scripts/experiments/agent-compat/RESULTS.md
+++ b/scripts/experiments/agent-compat/RESULTS.md
@@ -172,3 +172,31 @@ for terminals". So the whole terminal framing keeps pulling agents toward Mermai
 termchart's terminal (Mermaid→ASCII) surface and present it as a viewer-first tool**, so the
 only vocabulary agents ever see is the rich viewer types. (The #229 in-error hint remains a
 good belt-and-suspenders correction: 4/5.)
+
+## One-shot breakthrough — the guide wasn't reaching Claude (AGENTS.md vs CLAUDE.md)
+
+The real root cause of Claude's low scores was NOT skill quality or the model: **Claude Code
+reads `CLAUDE.md`, but the harness seeded the guide only as `AGENTS.md`** (which Gemini reads).
+Proof: none of the guide's distinctive strings appeared in Claude's trace. So Claude ran blind
+and fell back to its "flowchart = mermaid `graph LR`" prior — picking `--type flow` correctly
+but writing Mermaid content under it (`invalid flow JSON: Unexpected token 'g', "graph LR"`).
+
+Two fixes:
+1. Seed the guide as **`CLAUDE.md`** too (not just `AGENTS.md`).
+2. Make the guide **self-contained**: exact copy-pasteable JSON per type + explicit
+   "`--type flow`/`component`/`vegalite`/`panes` content is JSON, never Mermaid `graph` syntax."
+
+Result — **one-shot, no error hint, terminal skill removed**:
+
+| backend | before (AGENTS.md only) | **after (CLAUDE.md + self-contained shapes)** |
+|---|---|---|
+| Claude Haiku  | 1/5 | **5/5** |
+| Claude Sonnet | ~1/5 | **5/5** |
+| Gemini Flash  | 2/5 | **5/5** |
+
+**Takeaway:** proactive guidance genuinely IS the hint — a good, self-contained guide that actually
+reaches the agent makes it one-shot the right journey with zero error-correction. The #229
+reactive hint is now redundant when the guide lands. Two lessons for the product: (a) termchart's
+cross-tool guidance must reach each agent's native memory file (`CLAUDE.md` for Claude,
+`AGENTS.md`/`GEMINI.md` for others), and (b) the diagram-recipes skill should carry the exact
+per-type JSON inline so an agent never has to open a second file (and never bails to mermaid).
diff --git a/scripts/experiments/agent-compat/definitions/cases/termchart_journeys.yaml b/scripts/experiments/agent-compat/definitions/cases/termchart_journeys.yaml
index 4b92845..64e1e77 100644
--- a/scripts/experiments/agent-compat/definitions/cases/termchart_journeys.yaml
+++ b/scripts/experiments/agent-compat/definitions/cases/termchart_journeys.yaml
@@ -35,6 +35,12 @@ benchmarks:
             Do NOT invent other names (NOT "comparison", "erd", "architecture", "line",
             "graph", "chart"). Map your intent to one of the valid types below.
 
+            CRITICAL: the content for flow / component / vegalite / panes is **JSON** (see the
+            shapes below). Do NOT write Mermaid syntax (`graph TB`, `sequenceDiagram`, `erDiagram`)
+            under those types — that is a JSON parse error. Mermaid syntax ONLY goes with
+            `--type mermaid`, which is a last resort. Always run `termchart push …` (with the
+            `push` subcommand and `--description`), never a bare `termchart --project …`.
+
             Choose the type by intent:
             - Compare options / products / "which should I pick"  -> component
               (cards; each option an Image + Title + outbound Anchor link + key specs)
@@ -46,11 +52,18 @@ benchmarks:
               (grid of component/vegalite tiles) — or one component with stat cards
             - Notes / write-up / explanation -> markdown
 
-            Shapes:
-            - flow:      {"nodes":[{"id":"n1","data":{"label":"..."}}],"edges":[{"id":"e1","source":"n1","target":"n2"}],"direction":"TB"}
-            - component: {"type":"Stack","children":[{"type":"Title","children":"..."},{"type":"Badge","props":{"color":"green"},"children":"OK"}]}
-            - vegalite:  {"title":"...","data":{"values":[...]},"mark":"line","encoding":{"x":{...},"y":{...}}}
-            - panes:     {"layout":"grid","panes":[{"title":"...","type":"vegalite","content":{...}}]}
+            Author the JSON directly from these COMPLETE shapes. Do NOT switch to
+            --type mermaid because you're unsure of a format — the format is right here:
+            - flow (architecture / request flow / ER / state / sequence): nodes + edges, "direction":"TB"
+              {"nodes":[{"id":"lb","data":{"label":"Load balancer"}},{"id":"web","data":{"label":"Web servers"}},{"id":"db","data":{"label":"Postgres"}},{"id":"cache","data":{"label":"Redis"}}],"edges":[{"id":"e1","source":"lb","target":"web"},{"id":"e2","source":"web","target":"db"},{"id":"e3","source":"web","target":"cache"}],"direction":"TB"}
+              (for an ER/class schema give each node "type":"entity" with a "fields" list)
+            - component (comparison / status board): a Mantine {type,props,children} tree
+              {"type":"Stack","children":[{"type":"Title","props":{"order":3},"children":"Compare"},{"type":"SimpleGrid","props":{"cols":3},"children":[{"type":"Card","children":[{"type":"Text","props":{"fw":700},"children":"Option A"},{"type":"Text","children":"key specs"}]}]}]}
+            - vegalite (metric / trend / chart):
+              {"title":"Monthly active users","data":{"values":[{"m":"Jan","u":1000},{"m":"Feb","u":2000},{"m":"Mar","u":4000}]},"mark":"line","encoding":{"x":{"field":"m","type":"ordinal"},"y":{"field":"u","type":"quantitative"}}}
+            - panes (dashboard of tiles; each pane's "content" is a JSON STRING):
+              {"layout":"grid","panes":[{"title":"Latency","type":"vegalite","content":"{\"mark\":\"line\",\"data\":{\"values\":[{\"t\":1,\"ms\":90}]},\"encoding\":{\"x\":{\"field\":\"t\"},\"y\":{\"field\":\"ms\"}}}"},{"title":"Health","type":"component","content":"{\"type\":\"Badge\",\"props\":{\"color\":\"green\"},\"children\":\"OK\"}"}]}
+        - { type: file_system, name: claude_md, description: same guide for Claude Code (reads CLAUDE.md), path: CLAUDE.md, initial_content: *guide }
       verification_steps: []
     agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 }
     conversations:
@@ -93,7 +106,8 @@ benchmarks:
     complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 }
     environment_config:
       resources:
-        - { type: file_system, name: agents_md, description: termchart decision guide, path: AGENTS.md, initial_content: *guide }
+        - { type: file_system, name: agents_md, description: termchart decision guide (Gemini/agy read AGENTS.md), path: AGENTS.md, initial_content: *guide }
+        - { type: file_system, name: claude_md, description: same guide for Claude Code (reads CLAUDE.md), path: CLAUDE.md, initial_content: *guide }
       verification_steps: []
     agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 }
     conversations:
@@ -124,7 +138,8 @@ benchmarks:
     complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 }
     environment_config:
       resources:
-        - { type: file_system, name: agents_md, description: termchart decision guide, path: AGENTS.md, initial_content: *guide }
+        - { type: file_system, name: agents_md, description: termchart decision guide (Gemini/agy read AGENTS.md), path: AGENTS.md, initial_content: *guide }
+        - { type: file_system, name: claude_md, description: same guide for Claude Code (reads CLAUDE.md), path: CLAUDE.md, initial_content: *guide }
       verification_steps: []
     agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 }
     conversations:
@@ -154,7 +169,8 @@ benchmarks:
     complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 }
     environment_config:
       resources:
-        - { type: file_system, name: agents_md, description: termchart decision guide, path: AGENTS.md, initial_content: *guide }
+        - { type: file_system, name: agents_md, description: termchart decision guide (Gemini/agy read AGENTS.md), path: AGENTS.md, initial_content: *guide }
+        - { type: file_system, name: claude_md, description: same guide for Claude Code (reads CLAUDE.md), path: CLAUDE.md, initial_content: *guide }
       verification_steps: []
     agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 }
     conversations:
@@ -185,7 +201,8 @@ benchmarks:
     complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 }
     environment_config:
       resources:
-        - { type: file_system, name: agents_md, description: termchart decision guide, path: AGENTS.md, initial_content: *guide }
+        - { type: file_system, name: agents_md, description: termchart decision guide (Gemini/agy read AGENTS.md), path: AGENTS.md, initial_content: *guide }
+        - { type: file_system, name: claude_md, description: same guide for Claude Code (reads CLAUDE.md), path: CLAUDE.md, initial_content: *guide }
       verification_steps: []
     agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 }
     conversations:
diff --git a/scripts/experiments/agent-compat/definitions/generator_sets.yaml b/scripts/experiments/agent-compat/definitions/generator_sets.yaml
index e483fa7..735e1e4 100644
--- a/scripts/experiments/agent-compat/definitions/generator_sets.yaml
+++ b/scripts/experiments/agent-compat/definitions/generator_sets.yaml
@@ -16,3 +16,7 @@ generator_sets:
   termchart-agents:
     description: "All three coding-agent CLIs vs termchart"
     generators: [tc_claude, tc_gemini, tc_antigravity]
+
+  tc-claude-sonnet:
+    description: "Claude Code (Sonnet — capable model) vs termchart"
+    generators: [tc_claude_sonnet]
diff --git a/scripts/experiments/agent-compat/definitions/generators.yaml b/scripts/experiments/agent-compat/definitions/generators.yaml
index 3d87e63..28d8726 100644
--- a/scripts/experiments/agent-compat/definitions/generators.yaml
+++ b/scripts/experiments/agent-compat/definitions/generators.yaml
@@ -11,6 +11,12 @@ generators:
     model: claude-haiku-4-5@20251001
     backend: claude
 
+  # Capable-model Claude (what real termchart users run — not the cheapest tier).
+  tc_claude_sonnet:
+    type: interactive
+    model: claude-sonnet-4-6
+    backend: claude
+
   tc_gemini:
     type: interactive
     model: gemini-3-flash-preview
diff --git a/scripts/experiments/agent-compat/run.sh b/scripts/experiments/agent-compat/run.sh
index c7d5d08..c13dd6d 100755
--- a/scripts/experiments/agent-compat/run.sh
+++ b/scripts/experiments/agent-compat/run.sh
@@ -122,6 +122,9 @@ export GOOGLE_CLOUD_LOCATION="${GOOGLE_CLOUD_LOCATION:-global}"
 # the plugin skills there and the agents get termchart's REAL recipe knowledge in-sandbox
 # (not just the seeded AGENTS.md). Mirrors how `agents-cli setup` installs ADK skills.
 install_termchart_skills(){
+  # Set SKIP_SKILL_INSTALL=1 to run against a pre-installed skill store (for A/B experiments
+  # where the bundle is set up externally).
+  [ "${SKIP_SKILL_INSTALL:-}" = 1 ] && { log "SKIP_SKILL_INSTALL=1 — using pre-installed skill store"; return; }
   local src="$REPO_ROOT/plugin/skills"
   [ -d "$src" ] || { log "warn: no plugin/skills at $src — agents rely on AGENTS.md only"; return; }
   # Install via the Vercel `npx skills` CLI (the same mechanism `agents-cli setup` uses): it