From c43b8b92417eadd0a87a5ca5436d17473ee3fadc Mon Sep 17 00:00:00 2001 From: "donghe.kilov" Date: Thu, 4 Jun 2026 20:28:54 +0800 Subject: [PATCH] feat: add llms.txt support Add canonical llms.txt and llms-full.txt files for AI assistant consumption. Publish synchronized copies through docs/public so the VitePress site serves both files at the site root. Add llms:build and llms:check scripts plus a GitHub Actions guard to keep generated copies in sync. Change-Id: I60d48a27634df5ac783b882022ed81c78e9e7af6 Verification: npm run llms:check && npm run docs:build. --- .github/workflows/llms-txt-check.yml | 24 ++++ README.md | 20 +++ docs/public/llms-full.txt | 182 +++++++++++++++++++++++++++ docs/public/llms.txt | 75 +++++++++++ llms-full.txt | 182 +++++++++++++++++++++++++++ llms.txt | 75 +++++++++++ package.json | 6 +- scripts/build-llms-txt.mjs | 77 ++++++++++++ 8 files changed, 639 insertions(+), 2 deletions(-) create mode 100644 .github/workflows/llms-txt-check.yml create mode 100644 docs/public/llms-full.txt create mode 100644 docs/public/llms.txt create mode 100644 llms-full.txt create mode 100644 llms.txt create mode 100755 scripts/build-llms-txt.mjs diff --git a/.github/workflows/llms-txt-check.yml b/.github/workflows/llms-txt-check.yml new file mode 100644 index 00000000..487bddd0 --- /dev/null +++ b/.github/workflows/llms-txt-check.yml @@ -0,0 +1,24 @@ +name: llms.txt sync check + +on: + pull_request: + paths: + - 'llms.txt' + - 'llms-full.txt' + - 'docs/public/llms.txt' + - 'docs/public/llms-full.txt' + - 'scripts/build-llms-txt.mjs' + - '.github/workflows/llms-txt-check.yml' + push: + branches: [main] + +jobs: + check: + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + - uses: actions/setup-node@v4 + with: + node-version: '20' + - name: Verify /llms.txt and docs/public/llms.txt are in sync + run: node scripts/build-llms-txt.mjs --check diff --git a/README.md b/README.md index 0f8d0c8f..45c3db4c 100644 --- a/README.md +++ b/README.md @@ -568,6 
+568,26 @@ This repository also includes reusable AI agent skills that you can install dire --- +## For AI assistants (`llms.txt`) + +This repository ships an [`llms.txt`](./llms.txt) at its root, following the +[llmstxt.org](https://llmstxt.org) specification, plus an expanded +[`llms-full.txt`](./llms-full.txt) bundle. Both files are also published at the +site root of the documentation site: + +- https://walkinglabs.github.io/learn-harness-engineering/llms.txt +- https://walkinglabs.github.io/learn-harness-engineering/llms-full.txt + +Point an LLM-aware tool at either URL to give it a curated index (or full +context bundle) of the course without having to crawl the whole site. + +The site copies are kept in sync with the repo-root files by +`npm run llms:build`, which runs automatically as part of `npm run docs:build`. +CI (`.github/workflows/llms-txt-check.yml`) enforces that the two copies stay +identical. + +--- + ## Other Courses Our team has also created other courses! Check them out: diff --git a/docs/public/llms-full.txt b/docs/public/llms-full.txt new file mode 100644 index 00000000..00dbe699 --- /dev/null +++ b/docs/public/llms-full.txt @@ -0,0 +1,182 @@ +# Learn Harness Engineering — Expanded Context Bundle + +> This is the long-form companion to `/llms.txt`. It inlines the project README and concise per-lecture summaries so an LLM can answer questions about the course in a single pass without dereferencing every link. URLs preserved from the index let you fetch the full source on demand. + +--- + +## About this bundle + +- Source repository: https://github.com/walkinglabs/learn-harness-engineering +- Documentation site: https://walkinglabs.github.io/learn-harness-engineering/ +- License & contribution policy: see the repository root. +- Status: course is actively evolving (v0.1.x). Lectures 09 and 10 are tracked in issue #2 — content may still be in flight at read time. 
+ +--- + +## Repository structure + +``` +learn-harness-engineering/ +├── docs/ # VitePress documentation site +│ ├── .vitepress/config.mts # Nav/sidebar config for all 13 locales +│ ├── public/ # Static assets served at site root (incl. llms.txt) +│ ├── en/ # English (structural source of truth) +│ │ ├── lectures/ # 12 lectures, each with index.md + code/ +│ │ ├── projects/ # 6 project descriptions +│ │ └── resources/ # Templates, references, OpenAI advanced pack +│ └── <locale>/ # 13 mirrored locales: zh, zh-TW, ja, ko, es, fr, +│ # ru, de, ar, vi, uz, tr, pt-BR +├── projects/ +│ ├── shared/ # Shared Electron + TS + React foundation +│ └── project-NN/ # Per-project starter/ and solution/ +├── skills/ +│ └── harness-creator/ # Scaffolds + validates harnesses +├── scripts/ # Dev tooling (PDF build, llms.txt generator, …) +├── CLAUDE.md # Claude Code instructions for this repo +└── README.md # Course overview (mirrored to llms.txt) +``` + +--- + +## The harness pattern (one paragraph) + +The model decides what code to write; the harness governs when, where, and how it writes it. A harness has five subsystems: **instructions** (AGENTS.md / CLAUDE.md / feature_list.json — what to do, in what order), **state** (progress.md / git log / session-handoff.md — what's been done), **verification** (tests + lint + type-check + smoke runs — only a passing suite counts as evidence), **scope** (one feature at a time, with an explicit definition of done), and **session lifecycle** (init.sh at start, clean-state checklist at end). Without the harness the agent says "done" when nothing works; with it, the agent stops only when verification passes. + +--- + +## Lecture summaries + +### L01 — Why capable agents still fail +Source: https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-01-why-capable-agents-still-fail/index.md + +Strong models pass benchmarks but fail real engineering tasks because the failure mode is environmental, not capability-based.
When something breaks, the cheapest first move is not to upgrade the model — it is to inspect the harness. If the same model succeeds on similar but well-structured tasks, the bottleneck is harness, not weights. A well-crafted `AGENTS.md` often beats a more expensive model. Anthropic's controlled experiment (Opus 4.5, same prompt, "build a 2D retro game editor") shows the same model produced an unusable artifact in 20 minutes without a harness and a playable game in 6 hours with a planner + generator + evaluator harness — qualitative, not marginal. + +### L02 — What a harness actually is +Source: https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-02-what-a-harness-actually-is/index.md + +"Harness" is widely misused as a synonym for prompt file. A prompt file is not a harness. A harness is the set of files, tools, environment configuration, persisted state, and feedback loops that constrain how the agent operates. OpenAI's framing — *"the repo IS the spec"* — captures the implication: every piece of context the agent needs must live in the repository, delivered through structured instruction files, explicit verification commands, and clear directory organization. The five subsystems are: instructions, tools, environment, state, feedback. + +### L03 — The repo is the single source of truth +If the agent can't see something in the repo, it doesn't exist for the agent. External knowledge, Slack threads, design docs, and tribal memory are invisible. Everything load-bearing has to be a file the agent reads at session start. + +### L04 — Split instructions across files +One giant instruction file fails because the agent over-summarizes it and loses the parts that matter. Progressive disclosure works: a short entrypoint (`AGENTS.md`) that hands off to focused documents the agent fetches on demand. Give the agent a map, not an encyclopedia. 
+ +### L05 — Keep context alive across sessions +Long-running tasks span multiple sessions; in-memory context evaporates between them. Persist progress to disk (`claude-progress.md`, `session-handoff.md`) so the next run resumes exactly where the last one stopped instead of starting fresh and redoing or re-breaking work. + +### L06 — Initialize before every agent session +Initialization deserves its own phase. `init.sh` runs install + verify + health-check before the agent touches code. A failed init blocks the session — that is the point. Verifying environment health up front is cheaper than debugging it inside the work. + +### L07 — Draw clear task boundaries +Agents overreach and under-finish when the scope is fuzzy. Constrain each session to exactly one unfinished feature, with an explicit definition of done. No half-finishing three things. No silently rewriting the feature list to hide unfinished work. + +### L08 — Feature lists as harness primitives +A `feature_list.json` is a machine-readable scope boundary the agent cannot quietly violate. It enumerates features and their status (`pending` / `in_progress` / `done` / `blocked`). The agent picks one, finishes it under verification, then updates the list — the list is not free-form prose for the agent to reinterpret. + +### L09 — Stop agents from declaring victory early +Source (tracked by issue #2 — may be incomplete at read time): https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-09-stop-early-victory/index.md + +Agents declare "done" because confidence ≠ correctness. The fix is a verification gap audit: the harness must require runnable evidence (test output, type-check result, smoke run log) before the agent is allowed to mark a feature complete. Without that, "done" is a hallucination. 
+ +### L10 — Full-pipeline runs are real verification +Source (tracked by issue #2 — may be incomplete at read time): https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-10-full-pipeline-verification/index.md + +Unit tests alone don't catch integration failures. Only a full end-to-end run (install → build → tests → lint → type-check → smoke / e2e) counts as real verification. The harness defines this pipeline once; the agent executes it after every feature. + +### L11 — Observability belongs inside the harness +If you can't see what the agent did during a session, you can't fix what it broke. Logs, diffs, command transcripts, and the progress file are the observability surface. They live inside the harness, not in some external dashboard the agent never writes to. + +### L12 — Every session leaves a clean state +The next session's success depends on this session's cleanup. Wrap-up steps: update `progress.md`, update `feature_list.json`, record what's still broken or unverified, commit only when safe to resume, write the handoff note. A messy exit poisons the next start. + +--- + +## Project arc (the capstone app) + +All six projects evolve the same product: an Electron-based personal knowledge-base desktop app with document import, library management, indexing, AI-powered Q&A, and citation-grounded answers. The architecture is fixed (main / preload / renderer / services / shared types); what changes across projects is the harness around it. + +- **P01** runs the same task twice (prompt-only vs. rules-first) so you see the gap with your own eyes; the minimal harness is `AGENTS.md` + `init.sh` + `feature_list.json`. +- **P02** restructures the repo so the agent can read it — agent-readable workspace, persistent state files. +- **P03** wires session continuity — progress log, handoff note, multi-session resumability. +- **P04** adds runtime feedback and scope control — incremental indexing as the worked example. 
+- **P05** makes the agent verify its own work — grounded Q&A with evidence-based completion. +- **P06** assembles the full harness with observability and an ablation study (capstone). + +Each project's `solution/` is the next project's `starter/`. The app grows; the harness skills grow with it. + +--- + +## harness-creator skill (one-paragraph reference) + +`skills/harness-creator/` is a reusable skill that scaffolds and validates harnesses for any project. `create-harness.mjs` detects common project types and package managers (Node / npm / pnpm / yarn / bun, Python, Go, Rust, Maven, Gradle, .NET) and emits `AGENTS.md` (or `CLAUDE.md`), `feature_list.json`, `progress.md`, `init.sh`, and `session-handoff.md`. `validate-harness.mjs` scores the five subsystems structurally and `run-benchmark.mjs` renders an HTML assessment report. The scripts use only Node.js built-ins, so the directory can be copied into another repository and run directly. The skill's boundary: it is for harness engineering, not model selection, prompt tuning alone, or app architecture — project-specific facts stay in the target repo. + +Install: +``` +npx skills add walkinglabs/learn-harness-engineering --skill harness-creator +``` + +Run: +``` +node skills/harness-creator/scripts/create-harness.mjs --target /path/to/project +node skills/harness-creator/scripts/validate-harness.mjs --target /path/to/project +node skills/harness-creator/scripts/run-benchmark.mjs --target /path/to/project --html /path/to/report.html +``` + +--- + +## Agent session lifecycle (canonical sequence) + +``` +START + 1. Agent reads AGENTS.md / CLAUDE.md + 2. Agent runs init.sh (install, verify, health check) + 3. Agent reads claude-progress.md (what happened last time) + 4. Agent reads feature_list.json (what's done, what's next) + 5. Agent checks git log (recent changes) + +SELECT + 6. Agent picks exactly ONE unfinished feature + 7. Agent works only on that feature + +EXECUTE + 8. Agent implements the feature + 9. 
Agent runs verification (tests, lint, type-check) +10. If verification fails: fix and re-run +11. If verification passes: record evidence + +WRAP UP +12. Agent updates claude-progress.md +13. Agent updates feature_list.json +14. Agent records what's still broken or unverified +15. Agent commits (only when safe to resume) +16. Agent leaves clean restart path for next session +``` + +--- + +## Core external references + +- OpenAI — Harness engineering: leveraging Codex in an agent-first world. +- Anthropic — Effective harnesses for long-running agents. +- Anthropic — Harness design for long-running application development. +- OpenAI — Unrolling the Codex agent loop. +- Anthropic — Demystifying evals for AI agents. +- LangChain — Improving Deep Agents with harness engineering. +- Thoughtworks / Martin Fowler — Harness engineering for coding agent users. +- Cursor — Continually improving our agent harness. +- Companion repo — https://github.com/walkinglabs/awesome-harness-engineering + +For the full layered reading list see `docs/en/resources/reference/` in the repository. + +--- + +## When NOT to use this course + +- Zero-code AI introductions. +- Prompt-only workflows with no intention to build real implementations. +- Environments where the agent cannot read/write files or run commands. + +--- + +*This file is generated. Edit `llms.txt` and the lecture index files; regenerate with `npm run llms:build`.* diff --git a/docs/public/llms.txt b/docs/public/llms.txt new file mode 100644 index 00000000..74a99ad1 --- /dev/null +++ b/docs/public/llms.txt @@ -0,0 +1,75 @@ +# Learn Harness Engineering + +> A project-based course on building the environment, state management, verification, and control mechanisms that make AI coding agents work reliably. 
The course argues that "the strongest model in the world will still fail on real engineering tasks if you don't build a proper environment around it" — and teaches the five-subsystem harness (instructions, state, verification, scope, lifecycle) that fixes it. + +The repository ships three things you can consume: + +- A 12-lecture / 6-project curriculum delivered as a VitePress site, available in 13 languages (English as the structural source of truth; other locales mirror the English tree under `docs/<locale>/`). +- A capstone Electron + TypeScript + React knowledge-base app that evolves across the 6 projects (`projects/project-NN/starter` and `projects/project-NN/solution`). +- A reusable `harness-creator` skill (`skills/harness-creator/`) that scaffolds and validates a production-grade harness (AGENTS.md, feature_list.json, init.sh, progress.md, session-handoff.md) for any project. + +Conventions an LLM consuming this file should know: +- `AGENTS.md` / `CLAUDE.md` are the agent operating manuals at a repo root. +- `feature_list.json` defines machine-readable scope; only one feature is in flight at a time. +- `init.sh` runs install + verify + health-check at session start; `progress.md` and `session-handoff.md` carry state between sessions. +- Verification = a passing test suite, not the agent's self-assessment. + +## Start here + +- [Course README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/README.md): Course overview, learning path, syllabus, and how the 6 projects evolve the same Electron app. +- [Documentation site (English)](https://walkinglabs.github.io/learn-harness-engineering/): Rendered VitePress site — the canonical reading experience. +- [CLAUDE.md (repo root)](https://github.com/walkinglabs/learn-harness-engineering/blob/main/CLAUDE.md): How Claude Code should operate inside this repository (commands, structure, key patterns).
+ +## Lectures + +- [Lecture 01 — Why capable agents still fail](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-01-why-capable-agents-still-fail/index.md): The capability gap between benchmarks and real engineering; when something fails, check the harness before swapping the model. +- [Lecture 02 — What a harness actually is](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-02-what-a-harness-actually-is/index.md): A precise definition: five subsystems — instructions, tools, environment, state, feedback. A prompt file is not a harness. +- [Lecture 03 — The repo is the single source of truth](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-03-repo-as-source-of-truth/index.md): If the agent can't see it in the repo, it doesn't exist for the agent. +- [Lecture 04 — Split instructions across files](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-04-split-instructions/index.md): Progressive disclosure — give the agent a map, not an encyclopedia. +- [Lecture 05 — Keep context alive across sessions](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-05-keep-context-alive/index.md): Persist progress to disk; pick up exactly where the previous session left off. +- [Lecture 06 — Initialize before every agent session](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-06-initialize-session/index.md): Verify the environment is healthy before the agent starts work. +- [Lecture 07 — Draw clear task boundaries](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-07-task-boundaries/index.md): One feature at a time; an explicit definition of done. 
+- [Lecture 08 — Feature lists as harness primitives](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-08-feature-lists/index.md): Machine-readable scope boundaries the agent can't quietly rewrite. +- [Lecture 09 — Stop agents from declaring victory early](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-09-stop-early-victory/index.md): Verification gaps — confidence is not correctness. +- [Lecture 10 — Full-pipeline runs are real verification](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-10-full-pipeline-verification/index.md): Only an end-to-end run counts as evidence. +- [Lecture 11 — Observability belongs inside the harness](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-11-observability/index.md): If you can't see what the agent did, you can't fix what it broke. +- [Lecture 12 — Every session leaves a clean state](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-12-clean-handoff/index.md): The next session's success depends on this session's cleanup. + +## Projects + +- [Project 01 — Prompt-only vs. rules-first](https://github.com/walkinglabs/learn-harness-engineering/tree/main/projects/project-01): Minimal harness — AGENTS.md + init.sh + feature_list.json — and the measured difference. +- [Project 02 — Agent-readable workspace](https://github.com/walkinglabs/learn-harness-engineering/tree/main/projects/project-02): Restructure the repo so the agent can navigate it; persistent state files. +- [Project 03 — Multi-session continuity](https://github.com/walkinglabs/learn-harness-engineering/tree/main/projects/project-03): Progress log + session handoff so the next run picks up exactly where the last one stopped. 
+- [Project 04 — Runtime feedback and scope](https://github.com/walkinglabs/learn-harness-engineering/tree/main/projects/project-04): Stop the agent from doing too much or too little; incremental indexing. +- [Project 05 — Self-verification](https://github.com/walkinglabs/learn-harness-engineering/tree/main/projects/project-05): Grounded Q&A; evidence-based completion; the agent verifies its own work. +- [Project 06 — Build a complete harness (capstone)](https://github.com/walkinglabs/learn-harness-engineering/tree/main/projects/project-06): Full harness with observability and an ablation study. +- [Shared Electron foundation](https://github.com/walkinglabs/learn-harness-engineering/tree/main/projects/shared): The TypeScript + React + Electron base that every project starter/solution derives from. + +## Skills + +- [harness-creator README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/skills/harness-creator/README.md): A compact skill that scaffolds AGENTS.md, feature_list.json, init.sh, progress.md, and session-handoff.md; detects common project types; validates the five subsystems and emits an HTML report. +- [harness-creator SKILL.md](https://github.com/walkinglabs/learn-harness-engineering/blob/main/skills/harness-creator/SKILL.md): The skill manifest consumed by agent runtimes. +- [Skills directory README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/skills/README.md): How the skill was built with the skill-creator methodology (draft → test → evaluate → iterate). + +## Resources (templates and references) + +- [English resource library](https://github.com/walkinglabs/learn-harness-engineering/tree/main/docs/en/resources): Copy-ready `AGENTS.md`, `feature_list.json`, `init.sh`, `progress.md`, `session-handoff.md` plus reference reading lists. 
+- [Core references — OpenAI / Anthropic / LangChain / Cursor](https://github.com/walkinglabs/learn-harness-engineering/blob/main/README.md#core-references): Primary external sources the course synthesizes. +- [Awesome Harness Engineering](https://github.com/walkinglabs/awesome-harness-engineering): Curated list of articles, benchmarks, and open-source harnesses maintained by the same author. + +## Optional + +- [简体中文 README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/zh/README.md): Simplified Chinese landing page. +- [繁體中文 README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/zh-TW/README.md): Traditional Chinese landing page. +- [日本語 README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/ja/README.md): Japanese landing page. +- [한국어 README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/ko/README.md): Korean landing page. +- [Español README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/es/README.md): Spanish landing page. +- [Français README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/fr/README.md): French landing page. +- [Deutsch README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/de/README.md): German landing page. +- [Русский README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/ru/README.md): Russian landing page. +- [العربية README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/ar/README.md): Arabic landing page. +- [Tiếng Việt README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/vi/README.md): Vietnamese landing page. +- [Oʻzbekcha README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/uz/README.md): Uzbek landing page. +- [Türkçe README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/tr/README.md): Turkish landing page. 
+- [Portuguese (BR) README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/pt-BR/README.md): Brazilian Portuguese landing page. +- [Expanded context bundle (llms-full.txt)](https://walkinglabs.github.io/learn-harness-engineering/llms-full.txt): A single file that inlines the README and the 12 lecture summaries for one-shot ingestion. diff --git a/llms-full.txt b/llms-full.txt new file mode 100644 index 00000000..00dbe699 --- /dev/null +++ b/llms-full.txt @@ -0,0 +1,182 @@ +# Learn Harness Engineering — Expanded Context Bundle + +> This is the long-form companion to `/llms.txt`. It inlines the project README and concise per-lecture summaries so an LLM can answer questions about the course in a single pass without dereferencing every link. URLs preserved from the index let you fetch the full source on demand. + +--- + +## About this bundle + +- Source repository: https://github.com/walkinglabs/learn-harness-engineering +- Documentation site: https://walkinglabs.github.io/learn-harness-engineering/ +- License & contribution policy: see the repository root. +- Status: course is actively evolving (v0.1.x). Lectures 09 and 10 are tracked in issue #2 — content may still be in flight at read time. + +--- + +## Repository structure + +``` +learn-harness-engineering/ +├── docs/ # VitePress documentation site +│ ├── .vitepress/config.mts # Nav/sidebar config for all 13 locales +│ ├── public/ # Static assets served at site root (incl. 
llms.txt) +│ ├── en/ # English (structural source of truth) +│ │ ├── lectures/ # 12 lectures, each with index.md + code/ +│ │ ├── projects/ # 6 project descriptions +│ │ └── resources/ # Templates, references, OpenAI advanced pack +│ └── <locale>/ # 13 mirrored locales: zh, zh-TW, ja, ko, es, fr, +│ # ru, de, ar, vi, uz, tr, pt-BR +├── projects/ +│ ├── shared/ # Shared Electron + TS + React foundation +│ └── project-NN/ # Per-project starter/ and solution/ +├── skills/ +│ └── harness-creator/ # Scaffolds + validates harnesses +├── scripts/ # Dev tooling (PDF build, llms.txt generator, …) +├── CLAUDE.md # Claude Code instructions for this repo +└── README.md # Course overview (mirrored to llms.txt) +``` + +--- + +## The harness pattern (one paragraph) + +The model decides what code to write; the harness governs when, where, and how it writes it. A harness has five subsystems: **instructions** (AGENTS.md / CLAUDE.md / feature_list.json — what to do, in what order), **state** (progress.md / git log / session-handoff.md — what's been done), **verification** (tests + lint + type-check + smoke runs — only a passing suite counts as evidence), **scope** (one feature at a time, with an explicit definition of done), and **session lifecycle** (init.sh at start, clean-state checklist at end). Without the harness the agent says "done" when nothing works; with it, the agent stops only when verification passes. + +--- + +## Lecture summaries + +### L01 — Why capable agents still fail +Source: https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-01-why-capable-agents-still-fail/index.md + +Strong models pass benchmarks but fail real engineering tasks because the failure mode is environmental, not capability-based. When something breaks, the cheapest first move is not to upgrade the model — it is to inspect the harness. If the same model succeeds on similar but well-structured tasks, the bottleneck is harness, not weights.
A well-crafted `AGENTS.md` often beats a more expensive model. Anthropic's controlled experiment (Opus 4.5, same prompt, "build a 2D retro game editor") shows the same model produced an unusable artifact in 20 minutes without a harness and a playable game in 6 hours with a planner + generator + evaluator harness — qualitative, not marginal. + +### L02 — What a harness actually is +Source: https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-02-what-a-harness-actually-is/index.md + +"Harness" is widely misused as a synonym for prompt file. A prompt file is not a harness. A harness is the set of files, tools, environment configuration, persisted state, and feedback loops that constrain how the agent operates. OpenAI's framing — *"the repo IS the spec"* — captures the implication: every piece of context the agent needs must live in the repository, delivered through structured instruction files, explicit verification commands, and clear directory organization. The five subsystems are: instructions, tools, environment, state, feedback. + +### L03 — The repo is the single source of truth +If the agent can't see something in the repo, it doesn't exist for the agent. External knowledge, Slack threads, design docs, and tribal memory are invisible. Everything load-bearing has to be a file the agent reads at session start. + +### L04 — Split instructions across files +One giant instruction file fails because the agent over-summarizes it and loses the parts that matter. Progressive disclosure works: a short entrypoint (`AGENTS.md`) that hands off to focused documents the agent fetches on demand. Give the agent a map, not an encyclopedia. + +### L05 — Keep context alive across sessions +Long-running tasks span multiple sessions; in-memory context evaporates between them. 
Persist progress to disk (`claude-progress.md`, `session-handoff.md`) so the next run resumes exactly where the last one stopped instead of starting fresh and redoing or re-breaking work. + +### L06 — Initialize before every agent session +Initialization deserves its own phase. `init.sh` runs install + verify + health-check before the agent touches code. A failed init blocks the session — that is the point. Verifying environment health up front is cheaper than debugging it inside the work. + +### L07 — Draw clear task boundaries +Agents overreach and under-finish when the scope is fuzzy. Constrain each session to exactly one unfinished feature, with an explicit definition of done. No half-finishing three things. No silently rewriting the feature list to hide unfinished work. + +### L08 — Feature lists as harness primitives +A `feature_list.json` is a machine-readable scope boundary the agent cannot quietly violate. It enumerates features and their status (`pending` / `in_progress` / `done` / `blocked`). The agent picks one, finishes it under verification, then updates the list — the list is not free-form prose for the agent to reinterpret. + +### L09 — Stop agents from declaring victory early +Source (tracked by issue #2 — may be incomplete at read time): https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-09-stop-early-victory/index.md + +Agents declare "done" because confidence ≠ correctness. The fix is a verification gap audit: the harness must require runnable evidence (test output, type-check result, smoke run log) before the agent is allowed to mark a feature complete. Without that, "done" is a hallucination. + +### L10 — Full-pipeline runs are real verification +Source (tracked by issue #2 — may be incomplete at read time): https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-10-full-pipeline-verification/index.md + +Unit tests alone don't catch integration failures. 
Only a full end-to-end run (install → build → tests → lint → type-check → smoke / e2e) counts as real verification. The harness defines this pipeline once; the agent executes it after every feature. + +### L11 — Observability belongs inside the harness +If you can't see what the agent did during a session, you can't fix what it broke. Logs, diffs, command transcripts, and the progress file are the observability surface. They live inside the harness, not in some external dashboard the agent never writes to. + +### L12 — Every session leaves a clean state +The next session's success depends on this session's cleanup. Wrap-up steps: update `progress.md`, update `feature_list.json`, record what's still broken or unverified, commit only when safe to resume, write the handoff note. A messy exit poisons the next start. + +--- + +## Project arc (the capstone app) + +All six projects evolve the same product: an Electron-based personal knowledge-base desktop app with document import, library management, indexing, AI-powered Q&A, and citation-grounded answers. The architecture is fixed (main / preload / renderer / services / shared types); what changes across projects is the harness around it. + +- **P01** runs the same task twice (prompt-only vs. rules-first) so you see the gap with your own eyes; the minimal harness is `AGENTS.md` + `init.sh` + `feature_list.json`. +- **P02** restructures the repo so the agent can read it — agent-readable workspace, persistent state files. +- **P03** wires session continuity — progress log, handoff note, multi-session resumability. +- **P04** adds runtime feedback and scope control — incremental indexing as the worked example. +- **P05** makes the agent verify its own work — grounded Q&A with evidence-based completion. +- **P06** assembles the full harness with observability and an ablation study (capstone). + +Each project's `solution/` is the next project's `starter/`. The app grows; the harness skills grow with it. 
+ +--- + +## harness-creator skill (one-paragraph reference) + +`skills/harness-creator/` is a reusable skill that scaffolds and validates harnesses for any project. `create-harness.mjs` detects common project types and package managers (Node / npm / pnpm / yarn / bun, Python, Go, Rust, Maven, Gradle, .NET) and emits `AGENTS.md` (or `CLAUDE.md`), `feature_list.json`, `progress.md`, `init.sh`, and `session-handoff.md`. `validate-harness.mjs` scores the five subsystems structurally and `run-benchmark.mjs` renders an HTML assessment report. The scripts use only Node.js built-ins, so the directory can be copied into another repository and run directly. The skill's boundary: it is for harness engineering, not model selection, prompt tuning alone, or app architecture — project-specific facts stay in the target repo. + +Install: +``` +npx skills add walkinglabs/learn-harness-engineering --skill harness-creator +``` + +Run: +``` +node skills/harness-creator/scripts/create-harness.mjs --target /path/to/project +node skills/harness-creator/scripts/validate-harness.mjs --target /path/to/project +node skills/harness-creator/scripts/run-benchmark.mjs --target /path/to/project --html /path/to/report.html +``` + +--- + +## Agent session lifecycle (canonical sequence) + +``` +START + 1. Agent reads AGENTS.md / CLAUDE.md + 2. Agent runs init.sh (install, verify, health check) + 3. Agent reads claude-progress.md (what happened last time) + 4. Agent reads feature_list.json (what's done, what's next) + 5. Agent checks git log (recent changes) + +SELECT + 6. Agent picks exactly ONE unfinished feature + 7. Agent works only on that feature + +EXECUTE + 8. Agent implements the feature + 9. Agent runs verification (tests, lint, type-check) +10. If verification fails: fix and re-run +11. If verification passes: record evidence + +WRAP UP +12. Agent updates claude-progress.md +13. Agent updates feature_list.json +14. Agent records what's still broken or unverified +15. 
Agent commits (only when safe to resume) +16. Agent leaves clean restart path for next session +``` + +--- + +## Core external references + +- OpenAI — Harness engineering: leveraging Codex in an agent-first world. +- Anthropic — Effective harnesses for long-running agents. +- Anthropic — Harness design for long-running application development. +- OpenAI — Unrolling the Codex agent loop. +- Anthropic — Demystifying evals for AI agents. +- LangChain — Improving Deep Agents with harness engineering. +- Thoughtworks / Martin Fowler — Harness engineering for coding agent users. +- Cursor — Continually improving our agent harness. +- Companion repo — https://github.com/walkinglabs/awesome-harness-engineering + +For the full layered reading list see `docs/en/resources/reference/` in the repository. + +--- + +## When NOT to use this course + +- Zero-code AI introductions. +- Prompt-only workflows with no intention to build real implementations. +- Environments where the agent cannot read/write files or run commands. + +--- + +*The copy served from `docs/public/` is generated. Edit the repo-root `llms-full.txt`, then sync the copy with `npm run llms:build`.* diff --git a/llms.txt b/llms.txt new file mode 100644 index 00000000..74a99ad1 --- /dev/null +++ b/llms.txt @@ -0,0 +1,75 @@ +# Learn Harness Engineering + +> A project-based course on building the environment, state management, verification, and control mechanisms that make AI coding agents work reliably. The course argues that "the strongest model in the world will still fail on real engineering tasks if you don't build a proper environment around it" — and teaches the five-subsystem harness (instructions, state, verification, scope, lifecycle) that fixes it. + +The repository ships three things you can consume: + +- A 12-lecture / 6-project curriculum delivered as a VitePress site, available in 13 languages (English as the structural source of truth; other locales mirror the English tree under `docs/<locale>/`).
+- A capstone Electron + TypeScript + React knowledge-base app that evolves across the 6 projects (`projects/project-NN/starter` and `projects/project-NN/solution`). +- A reusable `harness-creator` skill (`skills/harness-creator/`) that scaffolds and validates a production-grade harness (AGENTS.md, feature_list.json, init.sh, progress.md, session-handoff.md) for any project. + +Conventions an LLM consuming this file should know: +- `AGENTS.md` / `CLAUDE.md` are the agent operating manuals at a repo root. +- `feature_list.json` defines machine-readable scope; only one feature is in flight at a time. +- `init.sh` runs install + verify + health-check at session start; `progress.md` and `session-handoff.md` carry state between sessions. +- Verification = a passing test suite, not the agent's self-assessment. + +## Start here + +- [Course README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/README.md): Course overview, learning path, syllabus, and how the 6 projects evolve the same Electron app. +- [Documentation site (English)](https://walkinglabs.github.io/learn-harness-engineering/): Rendered VitePress site — the canonical reading experience. +- [CLAUDE.md (repo root)](https://github.com/walkinglabs/learn-harness-engineering/blob/main/CLAUDE.md): How Claude Code should operate inside this repository (commands, structure, key patterns). + +## Lectures + +- [Lecture 01 — Why capable agents still fail](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-01-why-capable-agents-still-fail/index.md): The capability gap between benchmarks and real engineering; when something fails, check the harness before swapping the model. +- [Lecture 02 — What a harness actually is](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-02-what-a-harness-actually-is/index.md): A precise definition: five subsystems — instructions, tools, environment, state, feedback. 
A prompt file is not a harness. +- [Lecture 03 — The repo is the single source of truth](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-03-repo-as-source-of-truth/index.md): If the agent can't see it in the repo, it doesn't exist for the agent. +- [Lecture 04 — Split instructions across files](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-04-split-instructions/index.md): Progressive disclosure — give the agent a map, not an encyclopedia. +- [Lecture 05 — Keep context alive across sessions](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-05-keep-context-alive/index.md): Persist progress to disk; pick up exactly where the previous session left off. +- [Lecture 06 — Initialize before every agent session](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-06-initialize-session/index.md): Verify the environment is healthy before the agent starts work. +- [Lecture 07 — Draw clear task boundaries](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-07-task-boundaries/index.md): One feature at a time; an explicit definition of done. +- [Lecture 08 — Feature lists as harness primitives](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-08-feature-lists/index.md): Machine-readable scope boundaries the agent can't quietly rewrite. +- [Lecture 09 — Stop agents from declaring victory early](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-09-stop-early-victory/index.md): Verification gaps — confidence is not correctness. +- [Lecture 10 — Full-pipeline runs are real verification](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-10-full-pipeline-verification/index.md): Only an end-to-end run counts as evidence. 
+- [Lecture 11 — Observability belongs inside the harness](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-11-observability/index.md): If you can't see what the agent did, you can't fix what it broke. +- [Lecture 12 — Every session leaves a clean state](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/en/lectures/lecture-12-clean-handoff/index.md): The next session's success depends on this session's cleanup. + +## Projects + +- [Project 01 — Prompt-only vs. rules-first](https://github.com/walkinglabs/learn-harness-engineering/tree/main/projects/project-01): Minimal harness — AGENTS.md + init.sh + feature_list.json — and the measured difference. +- [Project 02 — Agent-readable workspace](https://github.com/walkinglabs/learn-harness-engineering/tree/main/projects/project-02): Restructure the repo so the agent can navigate it; persistent state files. +- [Project 03 — Multi-session continuity](https://github.com/walkinglabs/learn-harness-engineering/tree/main/projects/project-03): Progress log + session handoff so the next run picks up exactly where the last one stopped. +- [Project 04 — Runtime feedback and scope](https://github.com/walkinglabs/learn-harness-engineering/tree/main/projects/project-04): Stop the agent from doing too much or too little; incremental indexing. +- [Project 05 — Self-verification](https://github.com/walkinglabs/learn-harness-engineering/tree/main/projects/project-05): Grounded Q&A; evidence-based completion; the agent verifies its own work. +- [Project 06 — Build a complete harness (capstone)](https://github.com/walkinglabs/learn-harness-engineering/tree/main/projects/project-06): Full harness with observability and an ablation study. +- [Shared Electron foundation](https://github.com/walkinglabs/learn-harness-engineering/tree/main/projects/shared): The TypeScript + React + Electron base that every project starter/solution derives from. 
+ +## Skills + +- [harness-creator README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/skills/harness-creator/README.md): A compact skill that scaffolds AGENTS.md, feature_list.json, init.sh, progress.md, and session-handoff.md; detects common project types; validates the five subsystems and emits an HTML report. +- [harness-creator SKILL.md](https://github.com/walkinglabs/learn-harness-engineering/blob/main/skills/harness-creator/SKILL.md): The skill manifest consumed by agent runtimes. +- [Skills directory README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/skills/README.md): How the skill was built with the skill-creator methodology (draft → test → evaluate → iterate). + +## Resources (templates and references) + +- [English resource library](https://github.com/walkinglabs/learn-harness-engineering/tree/main/docs/en/resources): Copy-ready `AGENTS.md`, `feature_list.json`, `init.sh`, `progress.md`, `session-handoff.md` plus reference reading lists. +- [Core references — OpenAI / Anthropic / LangChain / Cursor](https://github.com/walkinglabs/learn-harness-engineering/blob/main/README.md#core-references): Primary external sources the course synthesizes. +- [Awesome Harness Engineering](https://github.com/walkinglabs/awesome-harness-engineering): Curated list of articles, benchmarks, and open-source harnesses maintained by the same author. + +## Optional + +- [简体中文 README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/zh/README.md): Simplified Chinese landing page. +- [繁體中文 README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/zh-TW/README.md): Traditional Chinese landing page. +- [日本語 README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/ja/README.md): Japanese landing page. +- [한국어 README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/ko/README.md): Korean landing page. 
+- [Español README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/es/README.md): Spanish landing page. +- [Français README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/fr/README.md): French landing page. +- [Deutsch README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/de/README.md): German landing page. +- [Русский README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/ru/README.md): Russian landing page. +- [العربية README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/ar/README.md): Arabic landing page. +- [Tiếng Việt README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/vi/README.md): Vietnamese landing page. +- [Oʻzbekcha README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/uz/README.md): Uzbek landing page. +- [Türkçe README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/tr/README.md): Turkish landing page. +- [Portuguese (BR) README](https://github.com/walkinglabs/learn-harness-engineering/blob/main/docs/pt-BR/README.md): Brazilian Portuguese landing page. +- [Expanded context bundle (llms-full.txt)](https://walkinglabs.github.io/learn-harness-engineering/llms-full.txt): A single file that inlines the README and the 12 lecture summaries for one-shot ingestion. 
diff --git a/package.json b/package.json index 253df40d..4c450085 100644 --- a/package.json +++ b/package.json @@ -8,12 +8,14 @@ "build": "vitepress build docs", "preview": "vitepress preview docs", "docs:dev": "vitepress dev docs", - "docs:build": "vitepress build docs", + "docs:build": "npm run llms:build && vitepress build docs", "docs:preview": "vitepress preview docs", "lecture:run": "tsx", "screenshots:readme": "node --import tsx scripts/capture-readme-screenshots.ts", "pdf:export": "node --import tsx scripts/build-course-pdfs.ts", - "pdf:build": "npm run docs:build && npm run pdf:export" + "pdf:build": "npm run docs:build && npm run pdf:export", + "llms:build": "node scripts/build-llms-txt.mjs", + "llms:check": "node scripts/build-llms-txt.mjs --check" }, "devDependencies": { "mermaid": "^11.14.0", diff --git a/scripts/build-llms-txt.mjs b/scripts/build-llms-txt.mjs new file mode 100755 index 00000000..48f13055 --- /dev/null +++ b/scripts/build-llms-txt.mjs @@ -0,0 +1,77 @@ +#!/usr/bin/env node +// scripts/build-llms-txt.mjs +// +// Sync the canonical /llms.txt and /llms-full.txt (at repo root) into +// docs/public/ so VitePress serves them at the site root. +// +// Why a copy and not symlinks: VitePress's static asset pipeline only +// picks up real files under docs/public/, and symlinks break on Windows +// contributors. The repo-root copy is the spec-required location +// (https://llmstxt.org), the docs/public/ copy is the served artifact. +// +// Usage: +// node scripts/build-llms-txt.mjs # copy + verify +// node scripts/build-llms-txt.mjs --check # verify only (CI guard) +// +// No third-party deps; Node 18+ built-ins only. 
import { mkdir, readFile, writeFile } from 'node:fs/promises';
import { dirname, resolve } from 'node:path';
import { fileURLToPath } from 'node:url';

const __filename = fileURLToPath(import.meta.url);
// The repo root is resolved as the parent of the directory holding this script.
const repoRoot = resolve(dirname(__filename), '..');

// Canonical files at the repo root that are mirrored into docs/public/.
const FILES = ['llms.txt', 'llms-full.txt'];
const PUBLIC_DIR = resolve(repoRoot, 'docs', 'public');

// With `--check` the two copies are only compared; nothing is written.
const checkOnly = process.argv.includes('--check');

/**
 * Make sure `dir` exists, creating missing parent directories as needed.
 *
 * @param {string} dir - Absolute path of the directory to create.
 * @returns {Promise<void>}
 */
async function ensureDir(dir) {
  // With `recursive: true`, mkdir resolves without error when the directory
  // already exists, so no separate existence probe (and no check-then-create
  // race) is needed.
  await mkdir(dir, { recursive: true });
}

/**
 * Read a file as raw bytes.
 *
 * @param {string} path - Absolute path of the file to read.
 * @returns {Promise<Buffer | null>} The file contents, or `null` when the
 *   file does not exist.
 * @throws Any read failure other than a missing file (e.g. EACCES, EISDIR),
 *   so a real I/O problem is not misreported as "missing" or "drift".
 */
async function readIfExists(path) {
  try {
    return await readFile(path);
  } catch (err) {
    if (err.code === 'ENOENT') {
      return null;
    }
    throw err;
  }
}

let mismatch = false;

for (const name of FILES) {
  const src = resolve(repoRoot, name);
  const dst = resolve(PUBLIC_DIR, name);

  const srcContent = await readIfExists(src);
  if (srcContent === null) {
    console.error(`[llms-txt] ERROR: missing source file ${src}`);
    process.exit(1);
  }

  if (checkOnly) {
    const dstContent = await readIfExists(dst);
    // Byte-for-byte comparison: a missing copy counts as drift, and no
    // text decoding can hide a difference between the two files.
    if (dstContent === null || !srcContent.equals(dstContent)) {
      console.error(
        `[llms-txt] DRIFT: docs/public/${name} is out of sync with /${name}. ` +
          `Run \`npm run llms:build\` and commit the result.`
      );
      // Keep going so every out-of-sync file is reported in a single run.
      mismatch = true;
    }
    continue;
  }

  await ensureDir(PUBLIC_DIR);
  // Writing the Buffer copies the source bytes unchanged.
  await writeFile(dst, srcContent);
  // `srcContent` is a Buffer, so `.length` is the true byte count even for
  // the non-ASCII text these files contain.
  console.log(`[llms-txt] wrote docs/public/${name} (${srcContent.length} bytes)`);
}

if (checkOnly && mismatch) {
  process.exit(1);
}