From f54fd1bfef100340498614c8f72e68b92f8f1213 Mon Sep 17 00:00:00 2001 From: Ivan Cheung Date: Fri, 3 Jul 2026 23:09:44 +0000 Subject: [PATCH 1/7] agent-compat: journey-activation harness for Claude/Gemini/Antigravity vs termchart Uses agent-generator's benchmark-runner to drive each external coding-agent CLI headless on termchart tasks and verify (via a local viewer, deterministically) whether it activates the right diagram journey for a scenario. - definitions/: smoke cases (can it push) + journey cases (scenario-only, does it pick the right --type). Shared AGENTS.md decision guide; scope-specific, trace-independent verify (fair to Antigravity which has no trace). - run.sh: builds+serves the repo viewer, PATH-wraps termchart (inject viewer creds) and antigravity->agy, clears per backend, loops backends, preflights+SKIPs agy (its $HOME login can't survive the sandbox's HOME=/tmp). - agent-compat.yml + run_locally.sh: run in CI or locally via act+podman. First results (RESULTS.md): Gemini 2/5, Claude 0/5 journey activation; both mis-use termchart's --type vocabulary and `push` silently accepts invalid types (no corrective feedback). Antigravity SKIP (auth). 
--- .github/workflows/agent-compat.yml | 72 ++++++ scripts/experiments/agent-compat/README.md | 85 ++++++++ scripts/experiments/agent-compat/RESULTS.md | 69 ++++++ .../agent-compat/definitions/case_sets.yaml | 17 ++ .../definitions/cases/termchart_journeys.yaml | 206 ++++++++++++++++++ .../cases/termchart_signup_flow.yaml | 102 +++++++++ .../cases/termchart_status_component.yaml | 91 ++++++++ .../definitions/generator_sets.yaml | 18 ++ .../agent-compat/definitions/generators.yaml | 23 ++ scripts/experiments/agent-compat/run.sh | 163 ++++++++++++++ .../experiments/agent-compat/run_locally.sh | 37 ++++ 11 files changed, 883 insertions(+) create mode 100644 .github/workflows/agent-compat.yml create mode 100644 scripts/experiments/agent-compat/README.md create mode 100644 scripts/experiments/agent-compat/RESULTS.md create mode 100644 scripts/experiments/agent-compat/definitions/case_sets.yaml create mode 100644 scripts/experiments/agent-compat/definitions/cases/termchart_journeys.yaml create mode 100644 scripts/experiments/agent-compat/definitions/cases/termchart_signup_flow.yaml create mode 100644 scripts/experiments/agent-compat/definitions/cases/termchart_status_component.yaml create mode 100644 scripts/experiments/agent-compat/definitions/generator_sets.yaml create mode 100644 scripts/experiments/agent-compat/definitions/generators.yaml create mode 100755 scripts/experiments/agent-compat/run.sh create mode 100755 scripts/experiments/agent-compat/run_locally.sh diff --git a/.github/workflows/agent-compat.yml b/.github/workflows/agent-compat.yml new file mode 100644 index 0000000..f19d0c8 --- /dev/null +++ b/.github/workflows/agent-compat.yml @@ -0,0 +1,72 @@ +name: agent-compat + +# Verify external coding-agent CLIs (Claude Code, Gemini CLI, Antigravity) can drive +# the termchart CLI to push valid boards to a live viewer. Orchestrated by +# agent-generator's benchmark-runner; the real logic lives in +# scripts/experiments/agent-compat/ (run.sh + definitions/). 
+# +# Runnable in CI (dispatch) and locally via act+podman +# (scripts/experiments/agent-compat/run_locally.sh) — cloud-auth steps are ACT-gated. + +on: + workflow_dispatch: + inputs: + backends: + description: "Space-separated backend sets" + default: "tc-claude tc-gemini tc-antigravity" + case_set: + description: "Case set (pattern ^TC-)" + default: "termchart-compat" + +jobs: + compat: + runs-on: ubuntu-latest + env: + AGENT_GENERATOR_DIR: ${{ vars.AGENT_GENERATOR_DIR || '/home/ivanmkc/agent-generator' }} + GOOGLE_CLOUD_PROJECT: ${{ vars.GCP_PROJECT_ID || 'adk-coding-agents' }} + ANTHROPIC_VERTEX_PROJECT_ID: ${{ vars.GCP_PROJECT_ID || 'adk-coding-agents' }} + CASE_SET: ${{ inputs.case_set }} + steps: + - uses: actions/checkout@v4 + + - uses: actions/setup-node@v4 + with: { node-version: "20" } + - uses: actions/setup-python@v5 + with: { python-version: "3.11" } + + - name: Install uv + run: curl -LsSf https://astral.sh/uv/install.sh | sh + + - name: Install agent CLIs + termchart + run: | + npm i -g @anthropic-ai/claude-code @google/gemini-cli @ivanmkc/termchart + # Antigravity (agy) is not public; run_locally.sh bind-mounts the host binary + # under act. In hosted CI the tc-antigravity backend is skipped (agy absent). + + - name: Install benchmark-runner (agent-generator) + run: | + if [ -d "$AGENT_GENERATOR_DIR" ]; then + (cd "$AGENT_GENERATOR_DIR" && uv tool install --prerelease allow .) + else + echo "::warning::AGENT_GENERATOR_DIR not present; provide agent-generator (bind-mount under act, or clone in CI)" + exit 1 + fi + echo "$HOME/.local/bin" >> "$GITHUB_PATH" + + # Hosted CI: authenticate to GCP via WIF (skipped under act — uses mounted ADC). + - name: GCP auth (CI only) + if: ${{ !env.ACT }} + uses: google-github-actions/auth@v3 + with: + workload_identity_provider: ${{ secrets.WIF_PROVIDER }} + service_account: ${{ secrets.WIF_SERVICE_ACCOUNT }} + + # Local act: verify a host ADC was mounted (models are Vertex-routed). 
+ - name: Verify ADC (act only) + if: ${{ env.ACT }} + run: | + test -f "$HOME/.config/gcloud/application_default_credentials.json" \ + || { echo "Mount host ADC: run via run_locally.sh"; exit 1; } + + - name: Run agent-compat + run: bash scripts/experiments/agent-compat/run.sh ${{ inputs.backends }} diff --git a/scripts/experiments/agent-compat/README.md b/scripts/experiments/agent-compat/README.md new file mode 100644 index 0000000..90f0b24 --- /dev/null +++ b/scripts/experiments/agent-compat/README.md @@ -0,0 +1,85 @@ +# agent-compat — do Claude Code, Gemini CLI & Antigravity play nice with termchart? + +Runs each external coding-agent CLI **headless** on termchart tasks and verifies, via a +live viewer, that each one: + +1. **can drive the `termchart` CLI** to push a valid board (smoke cases), and +2. **activates the right journey/recipe** — given a *scenario only* (never told the + diagram type), does it pick the correct termchart type for the job? (journey cases) + +Orchestrated by [agent-generator]'s `benchmark-runner` (interactive-simulation cases, +`output_format: direct` — prompts go straight to the CLI backend). + +## How it works + +``` +run.sh + ├─ build + run the repo's termchart CLI: termchart serve → local viewer, capture URL+token + ├─ PATH wrappers (under $HOME, NOT /tmp — the sandbox tmpfs-masks /tmp): + │ termchart → inject viewer URL/token, exec the repo CLI + │ antigravity → exec `agy --dangerously-skip-permissions` (harness calls `antigravity -p …`; agy -p == --print) + ├─ for each backend {claude, gemini-cli, antigravity}: + │ (antigravity: preflight agy auth under a fresh HOME → SKIP if it can't auth) + │ termchart clear --all → fresh viewer state + │ benchmark-runner --config-dir definitions --case-set <case-set> --generator-set <backend> + └─ summary (PASS / FAIL / SKIP per backend) +``` + +**Why a `termchart` wrapper, not env vars:** agent-generator's sandbox forwards `PATH` +to the agent but not arbitrary env (`TERMCHART_VIEWER_URL/TOKEN`). 
Baking the viewer +config into a PATH wrapper lets every agent run `termchart push` with zero credential +handling. It must live under `$HOME` (ro-bound into the sandbox), not `/tmp` (the sandbox +mounts a fresh tmpfs over `/tmp`, which would hide it). + +**Verification is deterministic, scope-specific, and trace-independent.** Each case pins +a unique scope (`--project … --agent <name>`) but lets the agent choose the `--type`; +a `command` objective then checks *that scope's* board has the prescribed type via +`termchart list` (e.g. `/compare …[component]`). Scope-specific ⇒ no cross-case +contamination / false passes. Trace-independent ⇒ it's the only fair way to score +**Antigravity**, whose harness produces no tool trace (the `cli_command` objective, which +needs a trace, is `required: false`). + +## Cases (`definitions/cases/`) + +| Case set | Cases | Tests | +|---|---|---| +| `termchart-smoke` | `TC-FLOW-001`, `TC-COMPONENT-001` | Prompt names the type — can the agent drive `termchart push` at all | +| `termchart-journeys` | `TC-JOURNEY-{COMPARE,ARCH,METRICS,ER,DASHBOARD}-001` | **Scenario only** — does the agent pick the right type/journey (component / flow / vegalite / flow / panes\|component) | +| `termchart-compat` | all `TC-*` | smoke + journeys | + +Every journey case seeds one shared `AGENTS.md` "choose the diagram" decision guide (auto-read by +claude/gemini/agy), so all backends have the same recipe knowledge — the test is whether +they **activate** the right journey, not whether they know termchart exists. + +## Run it + +```bash +scripts/experiments/agent-compat/run.sh # all backends, all cases +CASE_SET=termchart-journeys scripts/experiments/agent-compat/run.sh tc-claude tc-gemini +scripts/experiments/agent-compat/run_locally.sh # via GitHub Actions + act+podman +``` + +## Requirements + +- `claude`, `gemini`, `agy`, `node` on PATH (run.sh warns if any are missing). +- `benchmark-runner` (`cd "$AGENT_GENERATOR_DIR" && uv tool install --prerelease allow .`) or `uv` on PATH. 
+- GCP ADC — models are Vertex-routed (`gcloud auth application-default login`). +- The repo's termchart CLI + viewer are built automatically on first run. +- `AGENT_GENERATOR_DIR` (default `/home/ivanmkc/agent-generator`). + +## Backend status & caveats + +- **Claude Code, Gemini CLI — supported and verified.** Both pass the smoke cases (they can drive `termchart push`), though journey activation is weak (see RESULTS.md); + auth is forwarded into the sandbox via env (Vertex). +- **Antigravity (agy) — SKIPPED by default (auth limitation).** agy authenticates via a + login stored in `$HOME`, but agent-generator's sandbox forces `HOME=/tmp`, so agy can't + authenticate inside a case (its creds are neither ADC- nor env-based, so they can't be + forwarded the way Claude/Gemini creds are). run.sh detects this and marks `tc-antigravity` + **SKIP** instead of a misleading FAIL. The wrapper + deterministic check are correct and + will work once agy's auth is available in the sandbox — e.g. by extending agent-generator's + (stub) antigravity harness `get_sandbox_auth` to mount agy's credential dir, or running + agy's backend outside the sandbox. Its harness also extracts no tool trace (stub), so only + the deterministic viewer check applies to it. +- Add a case by dropping a `TC-*` interactive-simulation YAML in `definitions/cases/`. + +[agent-generator]: https://github.com/ivanmkc/agent-generator diff --git a/scripts/experiments/agent-compat/RESULTS.md b/scripts/experiments/agent-compat/RESULTS.md new file mode 100644 index 0000000..17244a3 --- /dev/null +++ b/scripts/experiments/agent-compat/RESULTS.md @@ -0,0 +1,69 @@ +# agent-compat — first results (2026-07-03) + +Ran the harness against Claude Code (`claude`, haiku-4.5), Gemini CLI (`gemini`, +gemini-3-flash), and Antigravity (`agy`), on this machine (Vertex-routed, local viewer). + +## Smoke (`termchart-smoke`) — "can the agent drive `termchart push` at all" + +Prompt explicitly names the diagram type. 
+ +| Backend | TC-FLOW-001 | TC-COMPONENT-001 | +|---|---|---| +| Claude Code | ✅ | ✅ | +| Gemini CLI | ✅ | ✅ | +| Antigravity | SKIP (auth) | SKIP (auth) | + +Both supported CLIs can author a spec and push it. The basic termchart CLI contract works. + +## Journey activation (`termchart-journeys`) — "does it pick the RIGHT journey" + +Prompt is **scenario only**; the agent must choose the correct termchart `--type`. All +backends get the same `AGENTS.md` decision guide (intent → type). Deterministic, +scope-specific verify (`termchart list` shows the case's board with the prescribed type). + +| Scenario → correct type | Claude Code | Gemini CLI | +|---|---|---| +| compare laptops → `component` | ❌ used `table` / `comparison` | ❌ used `comparison` | +| web-app architecture → `flow` | ❌ used `flowchart` | ❌ used `mermaid` | +| MAU trend → `vegalite` | ❌ used `line` | ✅ `vegalite` | +| blog DB schema → `flow` | ❌ used `erDiagram` / `erd` | ❌ used `er` | +| service dashboard → `panes`/`component` | ❌ used `status` / `graph` | ✅ `panes` | +| **Score** | **0 / 5** | **2 / 5** | + +(Identical across two runs, including one where `AGENTS.md` **explicitly** listed the +valid types and said "do not invent names" — behaviour did not change.) + +## Findings + +1. **Neither agent reliably activates termchart's `--type` vocabulary.** + - **Claude** anchors on **Mermaid / semantic** names (`flowchart`, `erDiagram`, `table`, + `line`, `status`) for every case — likely because `termchart --help` leads with its + "deterministic Mermaid → ASCII" heritage, so it treats termchart as a Mermaid tool. + - **Gemini** gets the termchart-native types right for charts/dashboards + (`vegalite`, `panes`) but falls back to invalid semantic names (`comparison`, `er`) + or `mermaid` for comparison / architecture / ER. + - So the diagram **journeys** (component for comparisons, flow for architecture/ER) are + mostly **not** being activated correctly by either CLI in a scenario-only prompt. + +2. 
**`termchart push` silently accepts unknown `--type` values.** `push --type comparison` + (or `erDiagram`, `status`, …) exits 0 and stores the board; the viewer then can't render + it. Because there is no error, the agent gets **no corrective feedback** and never + retries with a valid type. This is the single highest-leverage fix: **validate `--type` + in `push` and reject unknown values with the valid list** (`flow | component | vegalite | + panes | markdown | mermaid | calltree`). Agents would see the error and self-correct. + +3. **Antigravity can't be scored here.** `agy` authenticates via a login stored in `$HOME`, + but agent-generator's sandbox forces `HOME=/tmp`; its creds are neither ADC- nor + env-based, so (unlike Claude/Gemini's Vertex env) they can't be forwarded. run.sh + preflights this and reports SKIP. + +## Recommended next steps (not done here) + +- **Fix `termchart push` to validate `--type`** (finding #2) — most impactful for agent + compatibility; makes termchart self-correcting for any agent. Re-run this suite after. +- Make the rich viewer types prominent in `termchart --help` / plugin so Claude stops + defaulting to Mermaid vocabulary. +- Forward agy's auth into the sandbox (extend agent-generator's antigravity harness + `get_sandbox_auth`) to score Antigravity. +- Optionally mount the termchart plugin into Claude's sandbox to test its real-world + (plugin-equipped) behaviour vs the AGENTS.md-only condition measured here. 
diff --git a/scripts/experiments/agent-compat/definitions/case_sets.yaml b/scripts/experiments/agent-compat/definitions/case_sets.yaml new file mode 100644 index 0000000..90ed91a --- /dev/null +++ b/scripts/experiments/agent-compat/definitions/case_sets.yaml @@ -0,0 +1,17 @@ +kind: case_sets + +case_sets: + termchart-compat: + description: "All termchart CLI compatibility tasks (smoke + journey activation)" + patterns: + - "^TC-" + + termchart-smoke: + description: "Basic push smoke tests (prompt names the type)" + patterns: + - "^TC-(FLOW|COMPONENT)-" + + termchart-journeys: + description: "Journey activation: scenario-only prompts, verify the agent picks the right type" + patterns: + - "^TC-JOURNEY-" diff --git a/scripts/experiments/agent-compat/definitions/cases/termchart_journeys.yaml b/scripts/experiments/agent-compat/definitions/cases/termchart_journeys.yaml new file mode 100644 index 0000000..4b92845 --- /dev/null +++ b/scripts/experiments/agent-compat/definitions/cases/termchart_journeys.yaml @@ -0,0 +1,206 @@ +# Journey-activation cases: the prompt describes a SCENARIO only (never the diagram +# type). Each case pins a unique scope (--project tc-journey --agent <case>) but lets +# the agent CHOOSE the --type, then verifies THAT scope's board has the type the +# journey prescribes. Scope-specific check => no cross-case contamination and no false +# pass from another case's board. Outcome-based => trace-independent, fair across +# Claude / Gemini / Antigravity. +# +# All cases share one AGENTS.md "decision guide" (YAML anchor) so every backend has the +# same recipe knowledge — the test is whether it ACTIVATES the right one. +benchmarks: +- benchmark_type: interactive_simulation + id: TC-JOURNEY-COMPARE-001 + case_data: + name: "journey: compare options -> component" + id: TC-JOURNEY-COMPARE-001 + description: "Shopping/decision scenario should activate the product-comparison journey (component)." 
+ requirements: ["Pick the right termchart diagram type for a comparison and push it."] + complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 } + environment_config: + resources: + - type: file_system + name: agents_md + description: termchart decision guide, auto-read by claude/gemini/agy. + path: AGENTS.md + initial_content: &guide | + # termchart — pick the RIGHT diagram for the job + + `termchart` is on PATH; the viewer is pre-configured (no token needed). + Choose the TYPE that best fits the request, author its JSON, push it, confirm: + termchart push --project

--agent --type --description "" --focus + termchart list + + IMPORTANT: --type MUST be exactly one of these termchart types: + flow | component | vegalite | panes | markdown | mermaid + Do NOT invent other names (NOT "comparison", "erd", "architecture", "line", + "graph", "chart"). Map your intent to one of the valid types below. + + Choose the type by intent: + - Compare options / products / "which should I pick" -> component + (cards; each option an Image + Title + outbound Anchor link + key specs) + - System/service architecture, request flow, pipeline, ER schema, state machine -> flow + (nodes+edges, "direction":"TB"; for ER use entity nodes with typed fields) + - A metric / trend / distribution over time or category -> vegalite + (a Vega-Lite spec: title, axis titles, tooltip) + - Dashboard of several independent tiles (latency, errors, uptime) -> panes + (grid of component/vegalite tiles) — or one component with stat cards + - Notes / write-up / explanation -> markdown + + Shapes: + - flow: {"nodes":[{"id":"n1","data":{"label":"..."}}],"edges":[{"id":"e1","source":"n1","target":"n2"}],"direction":"TB"} + - component: {"type":"Stack","children":[{"type":"Title","children":"..."},{"type":"Badge","props":{"color":"green"},"children":"OK"}]} + - vegalite: {"title":"...","data":{"values":[...]},"mark":"line","encoding":{"x":{...},"y":{...}}} + - panes: {"layout":"grid","panes":[{"title":"...","type":"vegalite","content":{...}}]} + verification_steps: [] + agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 } + conversations: + - id: compare + name: Compare laptops + initial_prompt: > + I'm choosing a new laptop and can't decide. Compare the MacBook Air M3, + the Dell XPS 13, and the Lenovo ThinkPad X1 Carbon for me — price, weight, + and battery life — and show it on the termchart viewer so I can eyeball it. 
+ Use exactly `--project tc-journey --agent compare` as the scope (you choose + the --type). Then run `termchart list` and report what landed. + simulant_goal: &sg > + Get the agent to choose an appropriate termchart diagram for the scenario and + push it to the given scope, then confirm via termchart list. Do NOT tell it + which type to use. Close as soon as it reports the list output or an error. + simulant_knowledge_system: &sks + static_narrative_facts: ["termchart is on PATH and the viewer is pre-configured."] + relevant_info_registry: [] + conversational_playbook: + - trigger_condition: The agent shows termchart list output or reports the push succeeded. + response: Thanks, that's exactly what I needed. + description: Close once a board is pushed. + - trigger_condition: The agent reports a termchart command error. + response: OK, thanks — please stop here. + description: Close on a terminal error. + validation_objectives: + - type: command + command: 'termchart list 2>/dev/null | grep -qE "/compare[[:space:]].*\[component\]"' + expected_exit_code: 0 + - type: cli_command + commands: [{ pattern: "termchart push", required: false }] + +- benchmark_type: interactive_simulation + id: TC-JOURNEY-ARCH-001 + case_data: + name: "journey: architecture -> flow" + id: TC-JOURNEY-ARCH-001 + description: "Architecture/request-flow scenario should activate a flow journey." 
+ requirements: ["Pick the right termchart diagram type for an architecture and push it."] + complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 } + environment_config: + resources: + - { type: file_system, name: agents_md, description: termchart decision guide, path: AGENTS.md, initial_content: *guide } + verification_steps: [] + agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 } + conversations: + - id: arch + name: Web app architecture + initial_prompt: > + Help me explain our web app's architecture to a new hire: users hit a load + balancer, which forwards to web servers, which read/write a Postgres database + and a Redis cache. Put it on the termchart viewer using exactly + `--project tc-journey --agent arch` (you choose the --type), then confirm with + `termchart list`. + simulant_goal: *sg + simulant_knowledge_system: *sks + validation_objectives: + - type: command + command: 'termchart list 2>/dev/null | grep -qE "/arch[[:space:]].*\[flow\]"' + expected_exit_code: 0 + - type: cli_command + commands: [{ pattern: "termchart push", required: false }] + +- benchmark_type: interactive_simulation + id: TC-JOURNEY-METRICS-001 + case_data: + name: "journey: metric trend -> vegalite" + id: TC-JOURNEY-METRICS-001 + description: "A metric/trend scenario should activate a vegalite chart journey." 
+ requirements: ["Pick the right termchart diagram type for a metric and push it."] + complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 } + environment_config: + resources: + - { type: file_system, name: agents_md, description: termchart decision guide, path: AGENTS.md, initial_content: *guide } + verification_steps: [] + agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 } + conversations: + - id: metrics + name: MAU growth + initial_prompt: > + Show me how our monthly active users grew over the last six months — roughly + 1000, 2000, 4000, 6000, 9000, then 12000. I want to see the trend on the + termchart viewer. Use exactly `--project tc-journey --agent metrics` (you + choose the --type), then confirm with `termchart list`. + simulant_goal: *sg + simulant_knowledge_system: *sks + validation_objectives: + - type: command + command: 'termchart list 2>/dev/null | grep -qE "/metrics[[:space:]].*\[vegalite\]"' + expected_exit_code: 0 + - type: cli_command + commands: [{ pattern: "termchart push", required: false }] + +- benchmark_type: interactive_simulation + id: TC-JOURNEY-ER-001 + case_data: + name: "journey: db schema -> flow (entity)" + id: TC-JOURNEY-ER-001 + description: "A database-schema scenario should activate an ER/entity flow journey." 
+ requirements: ["Pick the right termchart diagram type for a schema and push it."] + complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 } + environment_config: + resources: + - { type: file_system, name: agents_md, description: termchart decision guide, path: AGENTS.md, initial_content: *guide } + verification_steps: [] + agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 } + conversations: + - id: er + name: Blog schema + initial_prompt: > + Sketch the database schema for a simple blog: a users table, a posts table, + and a comments table, with their key fields and how they relate (a user has + many posts; a post has many comments). Put it on the termchart viewer using + exactly `--project tc-journey --agent er` (you choose the --type), then + confirm with `termchart list`. + simulant_goal: *sg + simulant_knowledge_system: *sks + validation_objectives: + - type: command + command: 'termchart list 2>/dev/null | grep -qE "/er[[:space:]].*\[flow\]"' + expected_exit_code: 0 + - type: cli_command + commands: [{ pattern: "termchart push", required: false }] + +- benchmark_type: interactive_simulation + id: TC-JOURNEY-DASHBOARD-001 + case_data: + name: "journey: health dashboard -> panes/component" + id: TC-JOURNEY-DASHBOARD-001 + description: "A multi-tile status scenario should activate a dashboard journey (panes or component)." 
+ requirements: ["Pick the right termchart diagram type for a dashboard and push it."] + complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 } + environment_config: + resources: + - { type: file_system, name: agents_md, description: termchart decision guide, path: AGENTS.md, initial_content: *guide } + verification_steps: [] + agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 } + conversations: + - id: dashboard + name: Service health dashboard + initial_prompt: > + Give me an at-a-glance health dashboard for three services (auth, payments, + search): show each one's p99 latency, error rate, and uptime. Put it on the + termchart viewer using exactly `--project tc-journey --agent dashboard` (you + choose the --type), then confirm with `termchart list`. + simulant_goal: *sg + simulant_knowledge_system: *sks + validation_objectives: + - type: command + command: 'termchart list 2>/dev/null | grep -qE "/dashboard[[:space:]].*\[(panes|component)\]"' + expected_exit_code: 0 + - type: cli_command + commands: [{ pattern: "termchart push", required: false }] diff --git a/scripts/experiments/agent-compat/definitions/cases/termchart_signup_flow.yaml b/scripts/experiments/agent-compat/definitions/cases/termchart_signup_flow.yaml new file mode 100644 index 0000000..d3a212f --- /dev/null +++ b/scripts/experiments/agent-compat/definitions/cases/termchart_signup_flow.yaml @@ -0,0 +1,102 @@ +benchmarks: +- benchmark_type: interactive_simulation + id: TC-FLOW-001 + case_data: + name: "termchart — push a signup flowchart" + id: TC-FLOW-001 + description: > + Verify a coding-agent CLI can drive the termchart CLI to push a React-Flow + flowchart to the live viewer. The viewer is pre-configured via a PATH wrapper + (no token handling needed by the agent). Pass = a flow board actually lands. 
+ requirements: + - Use the termchart CLI to push a flow diagram to the viewer. + complexity: + architecture: ARCH_SINGLE + mutation_risk: RISK_READ_ONLY + tool_depth: DEPTH_ATOMIC + environment: ENV_LOCAL_FILES + adk_api_tier: ADK_T1 + environment_config: + resources: + - type: file_system + name: agents_md + description: Cross-tool termchart CLI reference, auto-read by claude/gemini/agy. + path: AGENTS.md + initial_content: | + # termchart CLI (available on PATH) + + `termchart` pushes rich diagrams to an already-configured live viewer. + You do NOT need any URL or token — it is wired up for you. + + Push a React-Flow graph: + 1. Write a JSON file, e.g. `flow.json`: + {"nodes":[{"id":"n1","data":{"label":"Step one"}}, + {"id":"n2","data":{"label":"Step two"}}], + "edges":[{"id":"e1","source":"n1","target":"n2"}], + "direction":"TB"} + 2. Push it: + termchart push --project

--agent --type flow \ + --description "" flow.json --focus + 3. List what's on the viewer: termchart list + 4. Fetch a spec back: termchart pull --project

--agent + + Types: flow, component, vegalite, markdown, panes, mermaid. + verification_steps: [] + agent_generation: + requirements: [] + example_conversations: [] + prompt_template: "direct" + output_format: + type: direct + max_generation_turns: 1 + conversations: + - id: push_signup_flow + name: Push a user-signup flowchart + initial_prompt: > + You have a `termchart` CLI on PATH that pushes diagrams to an + already-configured live viewer — no tokens or setup needed. + + Task: render a TOP-DOWN flowchart of a simple user sign-up flow. + + 1. Write a JSON file `flow.json` describing a React-Flow graph with 4 + nodes and connecting edges: + "Enter email" -> "Verify email" -> "Create password" -> "Account created". + Shape: {"nodes":[{"id":"n1","data":{"label":"Enter email"}}, ...], + "edges":[{"id":"e1","source":"n1","target":"n2"}, ...], + "direction":"TB"} + 2. Push it: + termchart push --project termchart-compat --agent signup --type flow + --description "user signup flow" flow.json --focus + 3. Confirm it landed by running: termchart list + + Report the final `termchart list` output verbatim. If any termchart + command errors, show the exact error text. + simulant_goal: > + Get the agent to author a flow spec and push it to termchart, then + confirm via termchart list. Close as soon as it reports the list output + or a terminal error. + simulant_knowledge_system: + static_narrative_facts: + - The termchart CLI is on PATH and the viewer is pre-configured. + relevant_info_registry: [] + conversational_playbook: + - trigger_condition: The agent shows termchart list output or reports the push succeeded. + response: Thanks, that's all I needed. + description: Close the conversation once the board is pushed. + - trigger_condition: The agent reports a termchart command error. + response: OK, thanks — please stop here. + description: Close on a terminal error; the deterministic check decides pass/fail. 
+ validation_objectives: + # PRIMARY, deterministic, trace-independent — works for ALL backends + # incl. antigravity (no trace). The viewer is cleared before each backend + # run, so any flow board present was pushed by THIS agent. + - type: command + command: 'termchart list 2>/dev/null | grep -qE "/signup[[:space:]].*\[flow\]"' + expected_exit_code: 0 + # SECONDARY, informational only (needs a tool trace; present for + # claude/gemini, absent for antigravity). required:false so it never + # fails a backend that lacks a trace. + - type: cli_command + commands: + - pattern: "termchart push" + required: false diff --git a/scripts/experiments/agent-compat/definitions/cases/termchart_status_component.yaml b/scripts/experiments/agent-compat/definitions/cases/termchart_status_component.yaml new file mode 100644 index 0000000..18dfb0c --- /dev/null +++ b/scripts/experiments/agent-compat/definitions/cases/termchart_status_component.yaml @@ -0,0 +1,91 @@ +benchmarks: +- benchmark_type: interactive_simulation + id: TC-COMPONENT-001 + case_data: + name: "termchart — push a status dashboard component" + id: TC-COMPONENT-001 + description: > + Verify a coding-agent CLI can drive the termchart CLI to push a Mantine + `component` board (a small status dashboard) to the live viewer. Pass = a + component board actually lands (deterministic, trace-independent). + requirements: + - Use the termchart CLI to push a component diagram to the viewer. + complexity: + architecture: ARCH_SINGLE + mutation_risk: RISK_READ_ONLY + tool_depth: DEPTH_ATOMIC + environment: ENV_LOCAL_FILES + adk_api_tier: ADK_T1 + environment_config: + resources: + - type: file_system + name: agents_md + description: Cross-tool termchart CLI reference, auto-read by claude/gemini/agy. + path: AGENTS.md + initial_content: | + # termchart CLI (available on PATH) + + `termchart` pushes rich diagrams to an already-configured live viewer. + You do NOT need any URL or token — it is wired up for you. 
+ + Push a Mantine component tree: + 1. Write a JSON file, e.g. `board.json`, whose top is a + {type, props, children} node, e.g.: + {"type":"Stack","props":{"gap":"md"}, + "children":[{"type":"Title","props":{"order":3},"children":"Status"}, + {"type":"Badge","props":{"color":"green"},"children":"Healthy"}]} + 2. Push it: + termchart push --project PROJECT --agent AGENT --type component \ + --description "DESCRIPTION" board.json --focus + 3. List what's on the viewer: termchart list + verification_steps: [] + agent_generation: + requirements: [] + example_conversations: [] + prompt_template: "direct" + output_format: + type: direct + max_generation_turns: 1 + conversations: + - id: push_status_component + name: Push a status dashboard component + initial_prompt: > + You have a `termchart` CLI on PATH that pushes diagrams to an + already-configured live viewer — no tokens or setup needed. + + Task: render a small service STATUS DASHBOARD as a termchart + `component` board. + + 1. Write a JSON file `board.json` with a Mantine {type, props, children} + tree: a Title "Service status", then two or three Badge components + (e.g. API "Healthy" green, DB "Degraded" yellow), wrapped in a Stack. + 2. Push it: + termchart push --project termchart-compat --agent status --type component + --description "service status dashboard" board.json --focus + 3. Confirm it landed by running: termchart list + + Report the final `termchart list` output verbatim. If any termchart + command errors, show the exact error text. + simulant_goal: > + Get the agent to author a component board and push it to termchart, then + confirm via termchart list. Close as soon as it reports the list output + or a terminal error. + simulant_knowledge_system: + static_narrative_facts: + - The termchart CLI is on PATH and the viewer is pre-configured. + relevant_info_registry: [] + conversational_playbook: + - trigger_condition: The agent shows termchart list output or reports the push succeeded. + response: Thanks, that's all I needed. + description: Close the conversation once the board is pushed. + - trigger_condition: The agent reports a termchart command error. + response: OK, thanks — please stop here. + description: Close on a terminal error; the deterministic check decides pass/fail.
+ validation_objectives: + - type: command + command: 'termchart list 2>/dev/null | grep -qE "/status[[:space:]].*\[component\]"' + expected_exit_code: 0 + - type: cli_command + commands: + - pattern: "termchart push" + required: false diff --git a/scripts/experiments/agent-compat/definitions/generator_sets.yaml b/scripts/experiments/agent-compat/definitions/generator_sets.yaml new file mode 100644 index 0000000..e483fa7 --- /dev/null +++ b/scripts/experiments/agent-compat/definitions/generator_sets.yaml @@ -0,0 +1,18 @@ +kind: generator_sets + +generator_sets: + tc-claude: + description: "Claude Code vs termchart" + generators: [tc_claude] + + tc-gemini: + description: "Gemini CLI vs termchart" + generators: [tc_gemini] + + tc-antigravity: + description: "Antigravity (agy) vs termchart" + generators: [tc_antigravity] + + termchart-agents: + description: "All three coding-agent CLIs vs termchart" + generators: [tc_claude, tc_gemini, tc_antigravity] diff --git a/scripts/experiments/agent-compat/definitions/generators.yaml b/scripts/experiments/agent-compat/definitions/generators.yaml new file mode 100644 index 0000000..3d87e63 --- /dev/null +++ b/scripts/experiments/agent-compat/definitions/generators.yaml @@ -0,0 +1,23 @@ +kind: generators + +# Three external coding-agent CLIs driven headless against termchart. +# backend values map to agent-generator's SimulatorBackend enum: +# claude -> `claude ... --dangerously-skip-permissions -p ""` +# gemini-cli -> `gemini --yolo ... -p ""` +# antigravity -> `antigravity -p ""` (our PATH wrapper execs `agy`) +generators: + tc_claude: + type: interactive + model: claude-haiku-4-5@20251001 + backend: claude + + tc_gemini: + type: interactive + model: gemini-3-flash-preview + backend: gemini-cli + + # antigravity harness ignores --model (base_cmd is hardcoded); value is a placeholder. 
+ tc_antigravity: + type: interactive + model: default + backend: antigravity diff --git a/scripts/experiments/agent-compat/run.sh b/scripts/experiments/agent-compat/run.sh new file mode 100755 index 0000000..820bc0e --- /dev/null +++ b/scripts/experiments/agent-compat/run.sh @@ -0,0 +1,163 @@ +#!/usr/bin/env bash +# agent-compat: run external coding-agent CLIs (Claude Code, Gemini CLI, Antigravity) +# against termchart via agent-generator's benchmark-runner, and verify each actually +# pushes a valid board to a local termchart viewer. +# +# Design (see README.md): +# * A local `termchart serve` viewer is started once; its URL+token are baked into a +# `termchart` PATH wrapper so agents run `termchart push` with no credential handling. +# (agent-generator's sandbox forwards PATH but NOT arbitrary env, so the wrapper — not +# env vars — is how the viewer config reaches the agent.) +# * `antigravity` is wrapped to exec `agy --dangerously-skip-permissions` (harness shells +# out to a binary literally named `antigravity`; agy's `-p` == `--print`). +# * The viewer is CLEARED before each backend so the deterministic "a board landed" check +# (a `command` validation objective) is unambiguous and trace-independent — the only way +# to fairly score Antigravity, whose harness produces no tool trace. +# +# Usage: +# run.sh [BACKEND_SET ...] 
# default: tc-claude tc-gemini tc-antigravity +# CASE_SET=termchart-compat run.sh tc-claude # single backend, all TC- cases +set -uo pipefail + +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +DEF_DIR="$SCRIPT_DIR/definitions" +AG_DIR="${AGENT_GENERATOR_DIR:-/home/ivanmkc/agent-generator}" +CASE_SET="${CASE_SET:-termchart-compat}" +BACKENDS=("$@"); [ ${#BACKENDS[@]} -eq 0 ] && BACKENDS=(tc-claude tc-gemini tc-antigravity) + +# NOTE: must live under $HOME, NOT /tmp — agent-generator's sandbox mounts a fresh +# tmpfs over /tmp, which would hide the PATH wrappers from the agent (it would fall +# through to a global `termchart` with no viewer configured). $HOME is ro-bound into +# the sandbox via `--ro-bind /`, so an absolute wrapper path there resolves. +mkdir -p "$HOME/.cache" +WORK="$(mktemp -d "$HOME/.cache/agent-compat.XXXXXX")" +BIN="$WORK/bin"; mkdir -p "$BIN" +SERVE_LOG="$WORK/serve.log" +SERVE_PID="" + +log(){ printf '\033[1;36m[agent-compat]\033[0m %s\n' "$*" >&2; } +err(){ printf '\033[1;31m[agent-compat] ERROR:\033[0m %s\n' "$*" >&2; } + +cleanup(){ [ -n "$SERVE_PID" ] && kill "$SERVE_PID" 2>/dev/null; rm -rf "$WORK" 2>/dev/null; } +trap cleanup EXIT + +# --- 0. locate tools ------------------------------------------------------- +# Use the repo's OWN built CLI, not a global install: `termchart serve` needs a +# co-located packages/viewer build (the published @ivanmkc/termchart-viewer is +# not on npm), so a global CLI can't start a viewer. +REPO_ROOT="$(cd "$SCRIPT_DIR/../../.." && pwd)" +REAL_TC="$REPO_ROOT/packages/cli/dist/cli.js" +if [ ! -f "$REAL_TC" ] || [ ! -f "$REPO_ROOT/packages/viewer/dist/server.js" ]; then + log "building termchart CLI + viewer (first run) ..." 
+ ( cd "$REPO_ROOT" && npm install && \ + npm run build --workspace @ivanmkc/termchart-viewer && \ + npm run build --workspace @ivanmkc/termchart ) >/dev/null 2>&1 \ + || { err "failed to build termchart CLI/viewer in $REPO_ROOT"; exit 3; } +fi +[ -f "$REAL_TC" ] || { err "termchart CLI not built at $REAL_TC"; exit 3; } +NODE_BIN="$(dirname "$(command -v node || echo /usr/bin/node)")" + +if command -v benchmark-runner >/dev/null 2>&1; then + RUNNER=(benchmark-runner) +elif command -v uv >/dev/null 2>&1; then + RUNNER=(uv run --project "$AG_DIR" benchmark-runner) +else + err "benchmark-runner not found and uv unavailable. Install agent-generator: (cd $AG_DIR && uv tool install --prerelease allow .)" + exit 3 +fi +log "runner: ${RUNNER[*]}" + +for c in claude gemini agy; do + command -v "$c" >/dev/null 2>&1 || err "warning: '$c' not on PATH — its backend will fail to launch" +done + +# --- 1. start the local viewer -------------------------------------------- +log "starting termchart serve ..." +"$REAL_TC" serve >"$SERVE_LOG" 2>&1 & +SERVE_PID=$! +VURL=""; VTOK="" +for _ in $(seq 1 60); do + VURL="$(grep -oE 'TERMCHART_VIEWER_URL=[^[:space:]]+' "$SERVE_LOG" | head -1 | cut -d= -f2-)" + VTOK="$(grep -oE 'TERMCHART_VIEWER_TOKEN=[^[:space:]]+' "$SERVE_LOG" | head -1 | cut -d= -f2-)" + [ -n "$VURL" ] && [ -n "$VTOK" ] && break + kill -0 "$SERVE_PID" 2>/dev/null || { err "termchart serve exited early:"; cat "$SERVE_LOG" >&2; exit 3; } + sleep 1 +done +[ -z "$VURL" ] && { err "timed out waiting for viewer URL. serve log:"; cat "$SERVE_LOG" >&2; exit 3; } +log "viewer ready: $VURL" + +# --- 2. PATH wrappers ------------------------------------------------------ +# termchart: inject viewer URL/token (the sandbox does not forward these env vars). +cat > "$BIN/termchart" < "$BIN/antigravity" </dev/null 2>&1 || return 1 + local out; out="$(HOME="$(mktemp -d)" timeout 45 agy --dangerously-skip-permissions -p "reply OK" 2>&1)" + ! 
grep -qiE 'authentication failed|not authenticated|please (log|sign) in|timed out' <<<"$out" +} + +# --- 4. per-backend loop (clear viewer -> run -> record) ------------------- +declare -A RESULT +for b in "${BACKENDS[@]}"; do + log "=== backend: $b ===" + if [ "$b" = "tc-antigravity" ] && ! agy_usable_in_sandbox; then + RESULT[$b]="SKIP (agy auth not available under sandbox HOME=/tmp — see README)" + log "backend $b -> ${RESULT[$b]}" + continue + fi + "$BIN/termchart" clear --all >/dev/null 2>&1 || log "warn: viewer clear failed (continuing)" + if "${RUNNER[@]}" \ + --config-dir "$DEF_DIR" \ + --case-set "$CASE_SET" \ + --generator-set "$b" \ + --name "compat-$b" \ + --user agent-compat \ + --concurrency 1 \ + --require-all-pass; then + RESULT[$b]="PASS" + else + RESULT[$b]="FAIL (exit $?)" + fi + log "backend $b -> ${RESULT[$b]}" +done + +# --- 5. summary ------------------------------------------------------------ +echo +echo "================ termchart agent-compat summary ================" +printf '%-16s %s\n' "BACKEND" "RESULT" +rc=0 +for b in "${BACKENDS[@]}"; do + printf '%-16s %s\n' "$b" "${RESULT[$b]}" + [[ "${RESULT[$b]}" == FAIL* ]] && rc=1 # SKIP does not fail the run +done +echo "case-set: $CASE_SET viewer: $VURL" +echo "===============================================================" +exit $rc diff --git a/scripts/experiments/agent-compat/run_locally.sh b/scripts/experiments/agent-compat/run_locally.sh new file mode 100755 index 0000000..54bacbd --- /dev/null +++ b/scripts/experiments/agent-compat/run_locally.sh @@ -0,0 +1,37 @@ +#!/usr/bin/env bash +# Run the agent-compat workflow locally via `act` + podman (mirrors +# agent-generator/.github/scripts/run_eval_locally.sh). Bind-mounts the host's +# ADC and the agy (Antigravity) binary, which are not available in a fresh runner. 
+# +# Prereqs: act (>=0.2.87), podman (rootless socket active), gcloud ADC, and the +# host CLIs (claude, gemini, agy, termchart) — or let the workflow npm-install the +# public ones. Simpler alternative for iterating: run run.sh directly (no act). +set -euo pipefail + +REPO_ROOT="$(cd "$(dirname "${BASH_SOURCE[0]}")/../../.." && pwd)" +ADC="$HOME/.config/gcloud/application_default_credentials.json" +BACKENDS="${1:-tc-claude tc-gemini tc-antigravity}" + +[ -f "$ADC" ] || { echo "Missing ADC at $ADC — run: gcloud auth application-default login"; exit 1; } + +# Point act/docker at the rootless podman socket if DOCKER_HOST is unset. +export DOCKER_HOST="${DOCKER_HOST:-unix:///run/user/$(id -u)/podman/podman.sock}" + +EVENT="$(mktemp)" +cat > "$EVENT" < Date: Sat, 4 Jul 2026 01:56:16 +0000 Subject: [PATCH 2/7] =?UTF-8?q?agent-compat:=20re-run=20vs=20merged=20#220?= =?UTF-8?q?=20--type=20validation=20=E2=80=94=20feedback=20loop=20works,?= =?UTF-8?q?=20journey=20selection=20still=20the=20gap?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit #219 (push --type validation) merged into master. Re-ran journeys: agents now get an actionable error on an invalid --type and retry with a VALID type every time (no more silently-stored unrenderable boards — finding #2 fixed). Scores stay ~1/5 because agents retry to a valid-but-wrong type (Claude falls back to mermaid); picking the RIGHT journey is a recipe-activation gap, not validation. --- scripts/experiments/agent-compat/RESULTS.md | 33 +++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/scripts/experiments/agent-compat/RESULTS.md b/scripts/experiments/agent-compat/RESULTS.md index 17244a3..358fe05 100644 --- a/scripts/experiments/agent-compat/RESULTS.md +++ b/scripts/experiments/agent-compat/RESULTS.md @@ -67,3 +67,36 @@ valid types and said "do not invent names" — behaviour did not change.) `get_sandbox_auth`) to score Antigravity. 
- Optionally mount the termchart plugin into Claude's sandbox to test its real-world (plugin-equipped) behaviour vs the AGENTS.md-only condition measured here. + +## Re-run against #219 (merged 2026-07-04) — `push` now validates `--type` + +PR #219 ("close the push→display validation gap") landed the fix recommended above: +`validateContent` now rejects unknown board types at push time (CLI offline fast-fail + +server 400) with the supported-type list. Re-ran the journey suite on merged master. + +**Scores:** Claude 1/5, Gemini 1/5 (metrics passes both; ±1 run-to-run variance — agents +are stochastic at K=1). Journey-activation scores did not jump — but the **failure mode +changed for the better**, which is the point of #219. + +**The feedback loop works (Claude transcripts):** every invalid `--type` now returns +`unknown type "X" — Supported types: …`, and Claude **retries with a valid type every +time** instead of silently storing an unrenderable board: + +| scenario | attempts | outcome | +|---|---|---| +| compare → component | `comparison` → ERR → `vegalite` | valid, wrong journey | +| architecture → flow | `architecture` → ERR → `flow` ✅ → `mermaid` | got flow, then overwrote | +| metrics → vegalite | `line` → ERR → `vegalite` | ✅ correct | +| ER → flow | `sql` → ERR → `mermaid` | valid, wrong journey | +| dashboard → panes | `graph` → ERR → `mermaid` | valid, wrong journey | + +**Takeaways:** +- **#219 fixes finding #2 (silent invalid-type acceptance).** No unrenderable boards land + anymore; agents get actionable feedback and self-correct to a *valid* type. +- **The remaining gap is journey selection, not validation.** Given the valid set, agents + often pick a valid-but-wrong type — Claude repeatedly falls back to `mermaid` (the + lowest-common-denominator) rather than the journey's type (component/flow/panes). That's + a recipe-activation problem: the error lists valid types but not which fits the intent. 
+- **Actionable next step:** have the reject message (or AGENTS.md/plugin) map intent→type + ("comparisons → component", "schema/architecture → flow"), so the self-correction lands + on the *right* journey, not just a renderable one. Then re-run. From b2bae511a7952b3cf3aaf2dfcaf9741a04f1925b Mon Sep 17 00:00:00 2001 From: Ivan Cheung Date: Sat, 4 Jul 2026 02:08:17 +0000 Subject: [PATCH 3/7] =?UTF-8?q?agent-compat:=20intent-hint=20rerun=20?= =?UTF-8?q?=E2=80=94=20journey=20activation=200/5->4/5=20(Claude),=202/5->?= =?UTF-8?q?3/5=20(Gemini)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The #229 did-you-mean/intent hint closes the journey-selection gap: agents now self-correct to the RIGHT type (comparison->component, architecture->flow, etc.), not just a valid one. Only ER residual (agents prefer mermaid's native erDiagram). --- scripts/experiments/agent-compat/RESULTS.md | 26 +++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/scripts/experiments/agent-compat/RESULTS.md b/scripts/experiments/agent-compat/RESULTS.md index 358fe05..cda20a2 100644 --- a/scripts/experiments/agent-compat/RESULTS.md +++ b/scripts/experiments/agent-compat/RESULTS.md @@ -100,3 +100,29 @@ time** instead of silently storing an unrenderable board: - **Actionable next step:** have the reject message (or AGENTS.md/plugin) map intent→type ("comparisons → component", "schema/architecture → flow"), so the self-correction lands on the *right* journey, not just a renderable one. Then re-run. + +## Re-run with the intent→type hint (PR #229) — the journey-selection gap closes + +#219 made agents self-correct to a *valid* type; the residual was they picked a valid-but- +**wrong** type (Claude falling back to `mermaid`). PR #229 adds the missing nudge: a +`Did you mean "component"?` + one-line intent→type guide in the reject message, and a +"Push --type" block in `termchart --help` listing the rich types. 
Re-ran the journey suite: + +| backend | before #219 | with #219 | **with #229 hint** | +|---|---|---|---| +| Claude Code | 0/5 | 1/5 | **4/5** | +| Gemini CLI | 2/5 | 1/5 | **3/5** | + +Per-case (with #229): compare→component, architecture→flow, metrics→vegalite, dashboard→ +panes/component all **PASS** for Claude (Gemini passes arch/metrics/dashboard). The +feedback loop now lands on the *right* journey: e.g. Claude `comparison` → ERR("did you +mean component") → **component** ✅. + +**Remaining residual — ER (both fail):** for a DB schema, agents reach for `--type mermaid` +(Mermaid has a native `erDiagram`), which is a *valid* type → no error → no hint → lands as +`mermaid` instead of the rich `flow` entity-node journey. Options if we want ER too: teach +the guide to prefer `flow` for schemas, or accept a mermaid erDiagram as a valid ER journey. + +**Net:** the harness drove two concrete termchart fixes — #219 (reject invalid types), +#229 (guide to the right type) — taking journey activation from 0/5 to 4/5 (Claude). The +tests live here (`definitions/` + `run.sh`); the product fixes are separate PRs. From 7bc848c4d7b53dbda60e755db970600fc011bdd2 Mon Sep 17 00:00:00 2001 From: Ivan Cheung Date: Sat, 4 Jul 2026 03:34:26 +0000 Subject: [PATCH 4/7] agent-compat: install termchart's bundled skills into the sandbox via npx skills MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Agents previously only had the seeded AGENTS.md — termchart's real skills (diagram-recipes etc.) ship as a plugin under ~/.claude/plugins, which the simulator sandbox does NOT mount. Install them with the Vercel `npx skills add plugin/skills -g --all` CLI (same mechanism agents-cli setup uses) so they land in the canonical ~/.agents/skills store with the per-agent symlinks the sandbox mounts. Workflow also runs `agents-cli setup --skip-auth` for the general bundle.
--- .github/workflows/agent-compat.yml | 12 ++++++++++++ scripts/experiments/agent-compat/run.sh | 22 ++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/.github/workflows/agent-compat.yml b/.github/workflows/agent-compat.yml index f19d0c8..be524b2 100644 --- a/.github/workflows/agent-compat.yml +++ b/.github/workflows/agent-compat.yml @@ -53,6 +53,18 @@ jobs: fi echo "$HOME/.local/bin" >> "$GITHUB_PATH" + - name: Install skills into the coding agents (agents-cli setup) + run: | + # Installs skills into ~/.claude/skills, ~/.gemini/extensions, ~/.agents/skills — + # the locations the simulator sandbox mounts. termchart's own skills are then + # installed on top by run.sh (from plugin/skills/), since termchart ships them as + # a plugin that doesn't land in those dirs. + if command -v agents-cli >/dev/null 2>&1; then + agents-cli setup --skip-auth || echo "::warning::agents-cli setup failed (continuing; run.sh still installs termchart skills)" + else + echo "::warning::agents-cli not found; skipping general skill setup (run.sh still installs termchart skills)" + fi + # Hosted CI: authenticate to GCP via WIF (skipped under act — uses mounted ADC). - name: GCP auth (CI only) if: ${{ !env.ACT }} diff --git a/scripts/experiments/agent-compat/run.sh b/scripts/experiments/agent-compat/run.sh index 820bc0e..bc473b4 100755 --- a/scripts/experiments/agent-compat/run.sh +++ b/scripts/experiments/agent-compat/run.sh @@ -115,6 +115,28 @@ export GOOGLE_GENAI_USE_VERTEXAI="${GOOGLE_GENAI_USE_VERTEXAI:-true}" export GOOGLE_CLOUD_PROJECT="${GOOGLE_CLOUD_PROJECT:-adk-coding-agents}" export GOOGLE_CLOUD_LOCATION="${GOOGLE_CLOUD_LOCATION:-global}" +# --- 3b. install termchart's bundled skills into the sandbox-mounted skill stores ---- +# termchart ships its skills (diagram-recipes etc.) as a plugin, which lands in +# ~/.claude/plugins — a path the agent-generator sandbox does NOT mount. 
The sandbox +# DOES mount ~/.agents/skills (shared, gemini) and ~/.claude/skills (claude), so install +# the plugin skills there and the agents get termchart's REAL recipe knowledge in-sandbox +# (not just the seeded AGENTS.md). Mirrors how `agents-cli setup` installs ADK skills. +install_termchart_skills(){ + local src="$REPO_ROOT/plugin/skills" + [ -d "$src" ] || { log "warn: no plugin/skills at $src — agents rely on AGENTS.md only"; return; } + # Install via the Vercel `npx skills` CLI (the same mechanism `agents-cli setup` uses): it + # places skills in the canonical global store (~/.agents/skills) with per-agent symlinks + # (~/.claude/skills, ~/.gemini/extensions) — exactly the dirs the sandbox mounts (it mounts + # BOTH the store and the per-agent dir, so the relative symlinks resolve in-sandbox). + npx -y skills add "$src" -g --all >/dev/null 2>&1 || true # nonzero for unrelated agents (Eve/…) + if [ -d "$HOME/.agents/skills/diagram-recipes" ]; then + log "installed termchart skills via npx skills: $(ls "$src" | tr '\n' ' ')" + else + log "warn: termchart skills not installed (npx skills add failed) — agents fall back to AGENTS.md" + fi +} +install_termchart_skills + # Preflight: agy authenticates via a login in $HOME, but the sandbox forces HOME=/tmp, # so agy can't auth inside a case (Claude/Gemini forward creds via env; agy doesn't). # Detect this up front and SKIP tc-antigravity rather than report a misleading FAIL. From c847218ecaa0aa6c5bf782afcbf4d375b51dcdb7 Mon Sep 17 00:00:00 2001 From: Ivan Cheung Date: Sat, 4 Jul 2026 03:38:04 +0000 Subject: [PATCH 5/7] agent-compat: real skills installed (npx skills) but agents don't auto-activate them MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Installing termchart's skills into the sandbox (diagram-recipes etc.) 
did NOT improve journey activation (Claude 1/5, Gemini 2/5) — transcripts show Claude never read the skill (read-skill=False) and kept using Mermaid keywords. Passive availability != activation. The #229 in-error did-you-mean hint (4/5) remains the effective lever because it lands where the agent is already looking. --- scripts/experiments/agent-compat/RESULTS.md | 28 +++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/scripts/experiments/agent-compat/RESULTS.md b/scripts/experiments/agent-compat/RESULTS.md index cda20a2..2c0dc9e 100644 --- a/scripts/experiments/agent-compat/RESULTS.md +++ b/scripts/experiments/agent-compat/RESULTS.md @@ -126,3 +126,31 @@ the guide to prefer `flow` for schemas, or accept a mermaid erDiagram as a valid **Net:** the harness drove two concrete termchart fixes — #219 (reject invalid types), #229 (guide to the right type) — taking journey activation from 0/5 to 4/5 (Claude). The tests live here (`definitions/` + `run.sh`); the product fixes are separate PRs. + +## Re-run with termchart's real skills installed in the sandbox (npx skills) + +run.sh now installs termchart's bundled skills (diagram-recipes, inbox-watch, termchart) +into the sandbox-mounted stores via `npx skills add plugin/skills -g --all` (the Vercel +skills CLI, same mechanism `agents-cli setup` uses) — so agents have termchart's REAL +recipe knowledge in-sandbox, not just the seeded AGENTS.md. Tested on master (#219's reject, +**without** the separate #229 hint) to isolate the skills' effect. + +| config | Claude | Gemini | +|---|---|---| +| #219 only (no skills, no hint) | 1/5 | 1/5 | +| **#219 + real skills installed (no hint)** | **1/5** | **2/5** | +| #219 + #229 hint (no skills) | 4/5 | 3/5 | + +**Finding: passively installing the skill does NOT improve journey activation.** The +transcripts show Claude never opened `diagram-recipes` (`read-skill=False`) and kept using +Mermaid keywords (`flowchart`, `erDiagram`, `mermaid`).
Headless agents don't spontaneously +activate a mounted skill from a plain scenario prompt (the agent-generator skill-activation +cases only get activation when the prompt explicitly says "check what skills are available +and use X"). So availability ≠ activation. + +**Takeaway:** in-the-moment feedback at the point of failure (#229's did-you-mean in the +reject message, 4/5) beats passively-available skills (1–2/5) for steering agents to the +right journey — because the hint lands in the tool output the agent is already reading. +The skills install is still worth keeping (it's how a faithful environment is set up, and +it helps when a prompt/slash-command does invoke the skill), but it is not the lever that +closes the gap. From f584de2ad8428ef2b1ec5ce90eeb2a133cff3a47 Mon Sep 17 00:00:00 2001 From: Ivan Cheung Date: Sat, 4 Jul 2026 03:55:42 +0000 Subject: [PATCH 6/7] =?UTF-8?q?agent-compat:=20correct=20finding=20?= =?UTF-8?q?=E2=80=94=20skills=20activate,=20but=20the=20TERMINAL=20termcha?= =?UTF-8?q?rt=20skill=20mis-activates?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Traces show Claude DOES call the Skill tool; it activates the 'termchart' terminal (Mermaid->ASCII) skill, not diagram-recipes, and gets anchored on Mermaid types. Removing it and leaving only diagram-recipes still drifts to mermaid because the Mermaid-terminal identity pervades termchart (CLI --help tagline etc.). run.sh now installs only the viewer skill; the durable fix is deprecating the terminal surface. 
--- scripts/experiments/agent-compat/RESULTS.md | 18 ++++++++++++++++++ scripts/experiments/agent-compat/run.sh | 18 +++++++++++++++--- 2 files changed, 33 insertions(+), 3 deletions(-) diff --git a/scripts/experiments/agent-compat/RESULTS.md b/scripts/experiments/agent-compat/RESULTS.md index 2c0dc9e..e8351aa 100644 --- a/scripts/experiments/agent-compat/RESULTS.md +++ b/scripts/experiments/agent-compat/RESULTS.md @@ -154,3 +154,21 @@ right journey — because the hint lands in the tool output the agent is already The skills install is still worth keeping (it's how a faithful environment is set up, and it helps when a prompt/slash-command does invoke the skill), but it is not the lever that closes the gap. + +## Correction: skills DO activate — the TERMINAL skill mis-activates (root cause) + +My earlier "read-skill=False" was a detection bug. Traces show Claude calls the `Skill` +tool on every case. But it activates the **`termchart` terminal skill** ("render Mermaid → +ASCII"), not `diagram-recipes` (the viewer-push journey guide) — the literal name "termchart" +wins for "the termchart viewer". That skill anchors Claude on Mermaid types (`flowchart`, +`erDiagram`, `line`), so journey activation stays ~1/5. + +Removing the terminal skill and installing ONLY `diagram-recipes` did NOT fix it either +(Claude still drifted to `graph`/`erd`/`mermaid`): the Mermaid-terminal identity is baked +into termchart itself — the CLI `--help` tagline is literally "deterministic Mermaid → ASCII +for terminals". So the whole terminal framing keeps pulling agents toward Mermaid. + +**Conclusion:** the durable fix isn't a hint or a skill tweak — it's to **deprecate +termchart's terminal (Mermaid→ASCII) surface and present it as a viewer-first tool**, so the +only vocabulary agents ever see is the rich viewer types. (The #229 in-error hint remains a +good belt-and-suspenders correction: 4/5.) 
diff --git a/scripts/experiments/agent-compat/run.sh b/scripts/experiments/agent-compat/run.sh index bc473b4..c7d5d08 100755 --- a/scripts/experiments/agent-compat/run.sh +++ b/scripts/experiments/agent-compat/run.sh @@ -128,11 +128,23 @@ install_termchart_skills(){ # places skills in the canonical global store (~/.agents/skills) with per-agent symlinks # (~/.claude/skills, ~/.gemini/extensions) — exactly the dirs the sandbox mounts (it mounts # BOTH the store and the per-agent dir, so the relative symlinks resolve in-sandbox). - npx -y skills add "$src" -g --all >/dev/null 2>&1 || true # nonzero for unrelated agents (Eve/…) + # + # Install ONLY the VIEWER-PUSH skill (diagram-recipes). The bundled `termchart` skill is for + # TERMINAL ASCII/Mermaid rendering — when present, agents activate it for "show on the viewer" + # tasks (it's the literal name match) and get anchored on Mermaid diagram types (flowchart / + # erDiagram), which is the wrong vocabulary for a rich viewer push. Remove it if a prior run + # (or the bundle) installed it, so diagram-recipes is the skill that activates. 
+ npx -y skills remove termchart -g -y >/dev/null 2>&1 || true + rm -rf "$HOME/.agents/skills/termchart" "$HOME/.claude/skills/termchart" "$HOME/.gemini/extensions/termchart" 2>/dev/null + local want=(diagram-recipes inbox-watch) + for s in "${want[@]}"; do + [ -f "$src/$s/SKILL.md" ] || continue + npx -y skills add "$src/$s" -g --all >/dev/null 2>&1 || true # nonzero for unrelated agents (Eve/…) + done if [ -d "$HOME/.agents/skills/diagram-recipes" ]; then - log "installed termchart skills via npx skills: $(ls "$src" | tr '\n' ' ')" + log "installed termchart viewer skill(s) via npx skills: ${want[*]} (terminal 'termchart' skill excluded)" else - log "warn: termchart skills not installed (npx skills add failed) — agents fall back to AGENTS.md" + log "warn: diagram-recipes not installed (npx skills add failed) — agents fall back to AGENTS.md" fi } install_termchart_skills From db7f86e7521e27c4e864614aa5267283d4c69315 Mon Sep 17 00:00:00 2001 From: Ivan Cheung Date: Sat, 4 Jul 2026 08:05:33 +0000 Subject: [PATCH 7/7] agent-compat: agents one-shot 5/5 once the guide reaches them (CLAUDE.md) + self-contained shapes Root cause of Claude's low journey scores was NOT skill quality/model: Claude Code reads CLAUDE.md, but the harness seeded the guide only as AGENTS.md (Gemini's file). Claude ran blind and fell back to mermaid 'graph LR' under --type flow. Fixes: seed CLAUDE.md too + make the guide self-contained (exact per-type JSON + 'flow content is JSON not mermaid'). Now one-shot with NO error hint: Claude haiku 5/5, Claude sonnet 5/5, Gemini 5/5 (was 1/5, ~1/5, 2/5). Adds tc_claude_sonnet generator + SKIP_SKILL_INSTALL for A/B. 
--- scripts/experiments/agent-compat/RESULTS.md | 28 +++++++++++++++ .../definitions/cases/termchart_journeys.yaml | 35 ++++++++++++++----- .../definitions/generator_sets.yaml | 4 +++ .../agent-compat/definitions/generators.yaml | 6 ++++ scripts/experiments/agent-compat/run.sh | 3 ++ 5 files changed, 67 insertions(+), 9 deletions(-) diff --git a/scripts/experiments/agent-compat/RESULTS.md b/scripts/experiments/agent-compat/RESULTS.md index e8351aa..5b6787f 100644 --- a/scripts/experiments/agent-compat/RESULTS.md +++ b/scripts/experiments/agent-compat/RESULTS.md @@ -172,3 +172,31 @@ for terminals". So the whole terminal framing keeps pulling agents toward Mermai termchart's terminal (Mermaid→ASCII) surface and present it as a viewer-first tool**, so the only vocabulary agents ever see is the rich viewer types. (The #229 in-error hint remains a good belt-and-suspenders correction: 4/5.) + +## One-shot breakthrough — the guide wasn't reaching Claude (AGENTS.md vs CLAUDE.md) + +The real root cause of Claude's low scores was NOT skill quality or the model: **Claude Code +reads `CLAUDE.md`, but the harness seeded the guide only as `AGENTS.md`** (which Gemini reads). +Proof: none of the guide's distinctive strings appeared in Claude's trace. So Claude ran blind +and fell back to its "flowchart = mermaid `graph LR`" prior — picking `--type flow` correctly +but writing Mermaid content under it (`invalid flow JSON: Unexpected token 'g', "graph LR"`). + +Two fixes: +1. Seed the guide as **`CLAUDE.md`** too (not just `AGENTS.md`). +2. Make the guide **self-contained**: exact copy-pasteable JSON per type + explicit + "`--type flow`/`component`/`vegalite` content is JSON, never Mermaid `graph` syntax." 
+ +Result — **one-shot, no error hint, terminal skill removed**: + +| backend | before (AGENTS.md only) | **after (CLAUDE.md + self-contained shapes)** | +|---|---|---| +| Claude Haiku | 1/5 | **5/5** | +| Claude Sonnet | ~1/5 | **5/5** | +| Gemini Flash | 2/5 | **5/5** | + +**Takeaway:** the skill genuinely IS the hint — a good, self-contained guide that actually +reaches the agent lets it one-shot the right journey with zero error-correction. The #229 +reactive hint becomes redundant once the guide lands. Two lessons for the product: (a) termchart's +cross-tool guidance must reach each agent's native memory file (`CLAUDE.md` for Claude, +`AGENTS.md`/`GEMINI.md` for others), and (b) the diagram-recipes skill should carry the exact +per-type JSON inline so an agent never has to open a second file (and never falls back to Mermaid). diff --git a/scripts/experiments/agent-compat/definitions/cases/termchart_journeys.yaml b/scripts/experiments/agent-compat/definitions/cases/termchart_journeys.yaml index 4b92845..64e1e77 100644 --- a/scripts/experiments/agent-compat/definitions/cases/termchart_journeys.yaml +++ b/scripts/experiments/agent-compat/definitions/cases/termchart_journeys.yaml @@ -35,6 +35,12 @@ benchmarks: Do NOT invent other names (NOT "comparison", "erd", "architecture", "line", "graph", "chart"). Map your intent to one of the valid types below. + CRITICAL: the content for flow / component / vegalite / panes is **JSON** (see the + shapes below). Do NOT write Mermaid syntax (`graph TB`, `sequenceDiagram`, `erDiagram`) + under those types — that is a JSON parse error. Mermaid syntax ONLY goes with + `--type mermaid`, which is a last resort. Always run `termchart push …` (with the + `push` subcommand and `--description`), never a bare `termchart --project …`. 
+ Choose the type by intent: - Compare options / products / "which should I pick" -> component (cards; each option an Image + Title + outbound Anchor link + key specs) @@ -46,11 +52,18 @@ benchmarks: (grid of component/vegalite tiles) — or one component with stat cards - Notes / write-up / explanation -> markdown - Shapes: - - flow: {"nodes":[{"id":"n1","data":{"label":"..."}}],"edges":[{"id":"e1","source":"n1","target":"n2"}],"direction":"TB"} - - component: {"type":"Stack","children":[{"type":"Title","children":"..."},{"type":"Badge","props":{"color":"green"},"children":"OK"}]} - - vegalite: {"title":"...","data":{"values":[...]},"mark":"line","encoding":{"x":{...},"y":{...}}} - - panes: {"layout":"grid","panes":[{"title":"...","type":"vegalite","content":{...}}]} + Author the JSON directly from these COMPLETE shapes. Do NOT switch to + --type mermaid because you're unsure of a format — the format is right here: + - flow (architecture / request flow / ER / state / sequence): nodes + edges, "direction":"TB" + {"nodes":[{"id":"lb","data":{"label":"Load balancer"}},{"id":"web","data":{"label":"Web servers"}},{"id":"db","data":{"label":"Postgres"}},{"id":"cache","data":{"label":"Redis"}}],"edges":[{"id":"e1","source":"lb","target":"web"},{"id":"e2","source":"web","target":"db"},{"id":"e3","source":"web","target":"cache"}],"direction":"TB"} + (for an ER/class schema give each node "type":"entity" with a "fields" list) + - component (comparison / status board): a Mantine {type,props,children} tree + {"type":"Stack","children":[{"type":"Title","props":{"order":3},"children":"Compare"},{"type":"SimpleGrid","props":{"cols":3},"children":[{"type":"Card","children":[{"type":"Text","props":{"fw":700},"children":"Option A"},{"type":"Text","children":"key specs"}]}]}]} + - vegalite (metric / trend / chart): + {"title":"Monthly active 
users","data":{"values":[{"m":"Jan","u":1000},{"m":"Feb","u":2000},{"m":"Mar","u":4000}]},"mark":"line","encoding":{"x":{"field":"m","type":"ordinal"},"y":{"field":"u","type":"quantitative"}}} + - panes (dashboard of tiles; each pane's "content" is a JSON STRING): + {"layout":"grid","panes":[{"title":"Latency","type":"vegalite","content":"{\"mark\":\"line\",\"data\":{\"values\":[{\"t\":1,\"ms\":90}]},\"encoding\":{\"x\":{\"field\":\"t\"},\"y\":{\"field\":\"ms\"}}}"},{"title":"Health","type":"component","content":"{\"type\":\"Badge\",\"props\":{\"color\":\"green\"},\"children\":\"OK\"}"}]} + - { type: file_system, name: claude_md, description: same guide for Claude Code (reads CLAUDE.md), path: CLAUDE.md, initial_content: *guide } verification_steps: [] agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 } conversations: @@ -93,7 +106,8 @@ benchmarks: complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 } environment_config: resources: - - { type: file_system, name: agents_md, description: termchart decision guide, path: AGENTS.md, initial_content: *guide } + - { type: file_system, name: agents_md, description: termchart decision guide (Gemini/agy read AGENTS.md), path: AGENTS.md, initial_content: *guide } + - { type: file_system, name: claude_md, description: same guide for Claude Code (reads CLAUDE.md), path: CLAUDE.md, initial_content: *guide } verification_steps: [] agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 } conversations: @@ -124,7 +138,8 @@ benchmarks: complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 } environment_config: resources: - - { type: file_system, name: 
agents_md, description: termchart decision guide, path: AGENTS.md, initial_content: *guide } + - { type: file_system, name: agents_md, description: termchart decision guide (Gemini/agy read AGENTS.md), path: AGENTS.md, initial_content: *guide } + - { type: file_system, name: claude_md, description: same guide for Claude Code (reads CLAUDE.md), path: CLAUDE.md, initial_content: *guide } verification_steps: [] agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 } conversations: @@ -154,7 +169,8 @@ benchmarks: complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 } environment_config: resources: - - { type: file_system, name: agents_md, description: termchart decision guide, path: AGENTS.md, initial_content: *guide } + - { type: file_system, name: agents_md, description: termchart decision guide (Gemini/agy read AGENTS.md), path: AGENTS.md, initial_content: *guide } + - { type: file_system, name: claude_md, description: same guide for Claude Code (reads CLAUDE.md), path: CLAUDE.md, initial_content: *guide } verification_steps: [] agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 } conversations: @@ -185,7 +201,8 @@ benchmarks: complexity: { architecture: ARCH_SINGLE, mutation_risk: RISK_READ_ONLY, tool_depth: DEPTH_ATOMIC, environment: ENV_LOCAL_FILES, adk_api_tier: ADK_T1 } environment_config: resources: - - { type: file_system, name: agents_md, description: termchart decision guide, path: AGENTS.md, initial_content: *guide } + - { type: file_system, name: agents_md, description: termchart decision guide (Gemini/agy read AGENTS.md), path: AGENTS.md, initial_content: *guide } + - { type: file_system, name: claude_md, description: same guide for Claude Code (reads CLAUDE.md), path: 
CLAUDE.md, initial_content: *guide } verification_steps: [] agent_generation: { requirements: [], example_conversations: [], prompt_template: "direct", output_format: { type: direct }, max_generation_turns: 1 } conversations: diff --git a/scripts/experiments/agent-compat/definitions/generator_sets.yaml b/scripts/experiments/agent-compat/definitions/generator_sets.yaml index e483fa7..735e1e4 100644 --- a/scripts/experiments/agent-compat/definitions/generator_sets.yaml +++ b/scripts/experiments/agent-compat/definitions/generator_sets.yaml @@ -16,3 +16,7 @@ generator_sets: termchart-agents: description: "All three coding-agent CLIs vs termchart" generators: [tc_claude, tc_gemini, tc_antigravity] + + tc-claude-sonnet: + description: "Claude Code (Sonnet — capable model) vs termchart" + generators: [tc_claude_sonnet] diff --git a/scripts/experiments/agent-compat/definitions/generators.yaml b/scripts/experiments/agent-compat/definitions/generators.yaml index 3d87e63..28d8726 100644 --- a/scripts/experiments/agent-compat/definitions/generators.yaml +++ b/scripts/experiments/agent-compat/definitions/generators.yaml @@ -11,6 +11,12 @@ generators: model: claude-haiku-4-5@20251001 backend: claude + # Capable-model Claude (what real termchart users run — not the cheapest tier). + tc_claude_sonnet: + type: interactive + model: claude-sonnet-4-6 + backend: claude + tc_gemini: type: interactive model: gemini-3-flash-preview diff --git a/scripts/experiments/agent-compat/run.sh b/scripts/experiments/agent-compat/run.sh index c7d5d08..c13dd6d 100755 --- a/scripts/experiments/agent-compat/run.sh +++ b/scripts/experiments/agent-compat/run.sh @@ -122,6 +122,9 @@ export GOOGLE_CLOUD_LOCATION="${GOOGLE_CLOUD_LOCATION:-global}" # the plugin skills there and the agents get termchart's REAL recipe knowledge in-sandbox # (not just the seeded AGENTS.md). Mirrors how `agents-cli setup` installs ADK skills. 
install_termchart_skills(){ + # Set SKIP_SKILL_INSTALL=1 to run against a pre-installed skill store (for A/B experiments + # where the bundle is set up externally). + [ "${SKIP_SKILL_INSTALL:-}" = 1 ] && { log "SKIP_SKILL_INSTALL=1 — using pre-installed skill store"; return; } local src="$REPO_ROOT/plugin/skills" [ -d "$src" ] || { log "warn: no plugin/skills at $src — agents rely on AGENTS.md only"; return; } # Install via the Vercel `npx skills` CLI (the same mechanism `agents-cli setup` uses): it