From f3f6ac9a7d0ce0f712c2c594f921fc400d9a6e8b Mon Sep 17 00:00:00 2001 From: Federico Bartoli Date: Mon, 13 Apr 2026 19:53:32 +0200 Subject: [PATCH 1/3] Validate llms.txt has the required H1 heading The llms.txt spec requires a single H1 with the project or site name as the first element in the ordered structure. The checker didn't verify this, so a file with no H1 (or multiple H1s) passed as well-formed. - error when no H1 is present - warning when multiple H1s are present - warning when the H1 is not the first content in the file Fenced code blocks are stripped before matching so '# comment' lines inside bash etc. aren't counted as H1s. Spec: https://llmstxt.org --- docs/checkers.md | 2 ++ src/checkers/llms-txt.js | 32 +++++++++++++++++++++++++++++++ test/checkers/llms-txt.test.js | 8 ++++++++ test/fixtures/llms-no-h1/llms.txt | 9 +++++++++ 4 files changed, 51 insertions(+) create mode 100644 test/fixtures/llms-no-h1/llms.txt diff --git a/docs/checkers.md b/docs/checkers.md index d8e67e6..c84e2dc 100644 --- a/docs/checkers.md +++ b/docs/checkers.md @@ -49,6 +49,8 @@ Checks for a well-formed `llms.txt` file, which serves as a structured index for - File itself is under 5,000 tokens (+1) - Organized with section headings (+1) +Also validates that the file has exactly one H1 heading placed first, as required by the spec. Violations are reported as findings (missing H1 → error; multiple H1s or H1 not first → warning). + **Specification:** https://llmstxt.org --- diff --git a/src/checkers/llms-txt.js b/src/checkers/llms-txt.js index fb9d2b2..a868478 100644 --- a/src/checkers/llms-txt.js +++ b/src/checkers/llms-txt.js @@ -7,6 +7,34 @@ const NAME = 'llms.txt Discovery Index'; const CATEGORY = 'discovery'; const MAX_SCORE = 10; +/** + * Validate the H1 heading against the llms.txt spec. + * The spec requires exactly one H1 and places it first in the ordered structure. + */ +function validateH1(content) { + const safe = content || ''; + // Strip fenced code blocks so '# comment' lines inside bash/etc. aren't counted as H1. + const stripped = safe.replace(/^([`~]{3,})[^\n]*\n[\s\S]*?^\1[^\n]*$/gm, ''); + const h1s = stripped.match(/^#\s+\S.*$/gm) || []; + const firstNonBlank = stripped.split('\n').find((l) => l.trim() !== '') || ''; + const startsWithH1 = /^#\s+\S/.test(firstNonBlank); + + if (h1s.length === 0) { + return finding('error', 'llms.txt is missing the required H1 heading.', + 'Add a top-level heading with your project or site name as the first line:\n# My Project'); + } + if (h1s.length > 1) { + return finding('warning', + `llms.txt contains ${h1s.length} H1 headings; the spec expects exactly one.`, + 'Keep a single top-level "# Project Name" heading. Use H2 (##) for sections.'); + } + if (!startsWithH1) { + return finding('warning', 'llms.txt H1 heading is not the first content in the file.', + 'Move the "# Project Name" heading to the top; the spec expects it before the blockquote summary and sections.'); + } + return finding('info', `H1 heading present: "${h1s[0].replace(/^#\s+/, '').trim()}".`); +} + /** * Check llms.txt exists and is well-formed. * @@ -48,6 +76,10 @@ export async function check(context) { score += 3; findings.push(finding('info', `llms.txt found at ${foundPath}.`)); + // Spec: single H1 with the project/site name, first in the ordered structure. + // https://llmstxt.org + findings.push(validateH1(content)); + // Check for structured links [title](url) const linkPattern = /\[([^\]]+)\]\(([^)]+)\)/g; const links = [...content.matchAll(linkPattern)]; diff --git a/test/checkers/llms-txt.test.js b/test/checkers/llms-txt.test.js index 1ab5b7d..09162fb 100644 --- a/test/checkers/llms-txt.test.js +++ b/test/checkers/llms-txt.test.js @@ -26,4 +26,12 @@ describe('llms-txt checker', () => { const withFix = result.findings.filter((f) => f.fix); assert.ok(withFix.length > 0, 'Should provide fix suggestions'); }); + + it('should report an error when llms.txt has no H1 heading', async () => { + const result = await check({ dir: join(FIXTURES, 'llms-no-h1'), projectDir: join(FIXTURES, 'llms-no-h1') }); + const h1Error = result.findings.find( + (f) => f.severity === 'error' && /H1/.test(f.message) + ); + assert.ok(h1Error, 'Should report a missing-H1 error finding'); + }); }); diff --git a/test/fixtures/llms-no-h1/llms.txt b/test/fixtures/llms-no-h1/llms.txt new file mode 100644 index 0000000..583f463 --- /dev/null +++ b/test/fixtures/llms-no-h1/llms.txt @@ -0,0 +1,9 @@ +> A site with an llms.txt that has links and sections but no H1. + +## Getting Started + +- [Quickstart](https://example.com/quickstart): Get going in 5 minutes (~500 tokens) + +## API Reference + +- [Users API](https://example.com/api/users): CRUD for user accounts (~800 tokens) From 7332ca822435080eadc30218ba2cdf121ccde147 Mon Sep 17 00:00:00 2001 From: Federico Bartoli Date: Mon, 13 Apr 2026 20:11:07 +0200 Subject: [PATCH 2/3] Fix H1 position check and allow CommonMark heading indentation - Check the H1 position against the original content, not the code-block-stripped version. Previously, a file starting with a fenced code block followed by an H1 was treated as "H1 first" because stripping hoisted the H1 to the top of the analyzed text. - Allow up to 3 spaces of leading indentation before the '#' as CommonMark does. Adds a regression fixture (code block before the H1) that now correctly produces the "not the first content" warning. --- src/checkers/llms-txt.js | 15 ++++++++++----- test/checkers/llms-txt.test.js | 8 ++++++++ test/fixtures/llms-h1-not-first/llms.txt | 12 ++++++++++++ 3 files changed, 30 insertions(+), 5 deletions(-) create mode 100644 test/fixtures/llms-h1-not-first/llms.txt diff --git a/src/checkers/llms-txt.js b/src/checkers/llms-txt.js index a868478..3aa6056 100644 --- a/src/checkers/llms-txt.js +++ b/src/checkers/llms-txt.js @@ -13,11 +13,16 @@ const MAX_SCORE = 10; */ function validateH1(content) { const safe = content || ''; - // Strip fenced code blocks so '# comment' lines inside bash/etc. aren't counted as H1. + // For counting H1s, strip fenced code blocks so '# comment' lines inside + // bash/etc. aren't matched. const stripped = safe.replace(/^([`~]{3,})[^\n]*\n[\s\S]*?^\1[^\n]*$/gm, ''); - const h1s = stripped.match(/^#\s+\S.*$/gm) || []; - const firstNonBlank = stripped.split('\n').find((l) => l.trim() !== '') || ''; - const startsWithH1 = /^#\s+\S/.test(firstNonBlank); + // CommonMark allows up to 3 spaces of indentation before an ATX heading. + const h1Regex = /^[ ]{0,3}#\s+\S.*$/gm; + const h1s = stripped.match(h1Regex) || []; + // For position, check the original content: a code block at the top still + // counts as content before the H1. + const firstNonBlank = safe.split('\n').find((l) => l.trim() !== '') || ''; + const startsWithH1 = /^[ ]{0,3}#\s+\S/.test(firstNonBlank); if (h1s.length === 0) { return finding('error', 'llms.txt is missing the required H1 heading.', @@ -32,7 +37,7 @@ function validateH1(content) { return finding('warning', 'llms.txt H1 heading is not the first content in the file.', 'Move the "# Project Name" heading to the top; the spec expects it before the blockquote summary and sections.'); } - return finding('info', `H1 heading present: "${h1s[0].replace(/^#\s+/, '').trim()}".`); + return finding('info', `H1 heading present: "${h1s[0].replace(/^\s*#\s+/, '').trim()}".`); } /** diff --git a/test/checkers/llms-txt.test.js b/test/checkers/llms-txt.test.js index 09162fb..b0bf3ef 100644 --- a/test/checkers/llms-txt.test.js +++ b/test/checkers/llms-txt.test.js @@ -34,4 +34,12 @@ describe('llms-txt checker', () => { ); assert.ok(h1Error, 'Should report a missing-H1 error finding'); }); + + it('should warn when the H1 is not the first content (code block above it)', async () => { + const result = await check({ dir: join(FIXTURES, 'llms-h1-not-first'), projectDir: join(FIXTURES, 'llms-h1-not-first') }); + const positionWarning = result.findings.find( + (f) => f.severity === 'warning' && /not the first content/.test(f.message) + ); + assert.ok(positionWarning, 'Should warn that the H1 is not the first content in the file'); + }); }); diff --git a/test/fixtures/llms-h1-not-first/llms.txt b/test/fixtures/llms-h1-not-first/llms.txt new file mode 100644 index 0000000..4e6e1ab --- /dev/null +++ b/test/fixtures/llms-h1-not-first/llms.txt @@ -0,0 +1,12 @@ +```bash +# install +npm install example +``` + +# ExampleDocs + +> A project whose llms.txt starts with a code block before the H1. + +## Getting Started + +- [Quickstart](https://example.com/quickstart): First steps (~500 tokens) From 698d70ed050769ce8fc049593424add383c0abaf Mon Sep 17 00:00:00 2001 From: Federico Bartoli Date: Mon, 13 Apr 2026 20:18:00 +0200 Subject: [PATCH 3/3] Handle UTF-8 BOM and setext H1 in llms.txt validation - Strip a leading BOM before running the position check, so files saved by editors that add one are not incorrectly flagged as "H1 not first content". - Normalize setext H1 syntax (Title\n=====) to ATX before matching, so a spec-compliant setext H1 is recognized. Fixtures and tests added for both cases. --- src/checkers/llms-txt.js | 17 ++++++++++++----- test/checkers/llms-txt.test.js | 16 ++++++++++++++++ test/fixtures/llms-bom/llms.txt | 7 +++++++ test/fixtures/llms-setext/llms.txt | 8 ++++++++ 4 files changed, 43 insertions(+), 5 deletions(-) create mode 100644 test/fixtures/llms-bom/llms.txt create mode 100644 test/fixtures/llms-setext/llms.txt diff --git a/src/checkers/llms-txt.js b/src/checkers/llms-txt.js index 3aa6056..19ab098 100644 --- a/src/checkers/llms-txt.js +++ b/src/checkers/llms-txt.js @@ -12,16 +12,23 @@ const MAX_SCORE = 10; * The spec requires exactly one H1 and places it first in the ordered structure. */ function validateH1(content) { - const safe = content || ''; + // Strip an optional UTF-8 BOM that some editors prepend on save. + const safe = (content || '').replace(/^\uFEFF/, ''); // For counting H1s, strip fenced code blocks so '# comment' lines inside // bash/etc. aren't matched. const stripped = safe.replace(/^([`~]{3,})[^\n]*\n[\s\S]*?^\1[^\n]*$/gm, ''); + // Normalize setext H1 (`Title\n====`) to ATX (`# Title`) so the rest of the + // checks apply uniformly. Only H1 ('='), not setext H2 ('-'). + const setextToAtx = (text) => + text.replace(/^(?!\s*$)([^\n]+)\n[ ]{0,3}=+[ \t]*$/gm, '# $1'); + const normStripped = setextToAtx(stripped); + const normSafe = setextToAtx(safe); + // CommonMark allows up to 3 spaces of indentation before an ATX heading. const h1Regex = /^[ ]{0,3}#\s+\S.*$/gm; - const h1s = stripped.match(h1Regex) || []; - // For position, check the original content: a code block at the top still - // counts as content before the H1. - const firstNonBlank = safe.split('\n').find((l) => l.trim() !== '') || ''; + const h1s = normStripped.match(h1Regex) || []; + // For position, check the original (BOM-stripped, setext-normalized) content. + const firstNonBlank = normSafe.split('\n').find((l) => l.trim() !== '') || ''; const startsWithH1 = /^[ ]{0,3}#\s+\S/.test(firstNonBlank); if (h1s.length === 0) { diff --git a/test/checkers/llms-txt.test.js b/test/checkers/llms-txt.test.js index b0bf3ef..5b7eaaa 100644 --- a/test/checkers/llms-txt.test.js +++ b/test/checkers/llms-txt.test.js @@ -42,4 +42,20 @@ describe('llms-txt checker', () => { ); assert.ok(positionWarning, 'Should warn that the H1 is not the first content in the file'); }); + + it('should accept a setext H1 (Title\\n=====) as a valid H1', async () => { + const result = await check({ dir: join(FIXTURES, 'llms-setext'), projectDir: join(FIXTURES, 'llms-setext') }); + const missingH1 = result.findings.find( + (f) => f.severity === 'error' && /missing the required H1/.test(f.message) + ); + assert.equal(missingH1, undefined, 'Setext H1 should not trigger missing-H1 error'); + }); + + it('should not flag H1 position when file starts with a UTF-8 BOM', async () => { + const result = await check({ dir: join(FIXTURES, 'llms-bom'), projectDir: join(FIXTURES, 'llms-bom') }); + const positionWarning = result.findings.find( + (f) => f.severity === 'warning' && /not the first content/.test(f.message) + ); + assert.equal(positionWarning, undefined, 'BOM prefix should not cause the position check to fail'); + }); }); diff --git a/test/fixtures/llms-bom/llms.txt b/test/fixtures/llms-bom/llms.txt new file mode 100644 index 0000000..1d0926f --- /dev/null +++ b/test/fixtures/llms-bom/llms.txt @@ -0,0 +1,7 @@ +# BomProject + +> A project whose llms.txt starts with a UTF-8 BOM. + +## Section + +- [Guide](https://example.com/guide): Intro (~200 tokens) diff --git a/test/fixtures/llms-setext/llms.txt b/test/fixtures/llms-setext/llms.txt new file mode 100644 index 0000000..d13b72b --- /dev/null +++ b/test/fixtures/llms-setext/llms.txt @@ -0,0 +1,8 @@ +MyProject +========= + +> A project using setext H1 syntax (title underlined with '='). + +## Section + +- [Guide](https://example.com/guide): Intro (~200 tokens)