/**
 * Validate the H1 heading of an llms.txt document against the llms.txt spec,
 * which expects exactly one H1 and expects it to be the first content.
 *
 * Both ATX (`# Title`) and setext (`Title` underlined with `=`) H1 forms are
 * recognised. `#` lines inside fenced code blocks are ignored when counting,
 * but a code block placed above the H1 still counts as "content before it".
 *
 * Known limitation: fences are removed with a regex, so an unclosed fence is
 * not treated as running to the end of the document.
 *
 * @param {string} content Raw llms.txt text. `null`/`undefined` is treated as
 *   empty; any other non-string value is coerced with `String()`.
 * @returns {*} Whatever `finding(severity, message, fix)` produces:
 *   - 'error'   when no H1 exists,
 *   - 'warning' when there are several H1s, or the single H1 is not first,
 *   - 'info'    (with the heading text in the message) when the H1 is valid.
 */
function validateH1(content) {
  const text = content == null ? '' : String(content);

  // Strip an optional UTF-8 BOM that some editors prepend on save, and
  // normalise CRLF / lone CR so the line-anchored patterns below only ever
  // have to deal with '\n'.
  const safe = text.replace(/^\uFEFF/, '').replace(/\r\n?/g, '\n');

  // For counting H1s, drop fenced code blocks so '# comment' lines inside
  // bash/etc. snippets are not mistaken for headings.
  const stripped = safe.replace(/^([`~]{3,})[^\n]*\n[\s\S]*?^\1[^\n]*$/gm, '');

  // Normalise setext H1 (`Title\n====`) to ATX (`# Title`) so the remaining
  // checks apply uniformly. Only H1 ('='), never setext H2 ('-').
  const setextToAtx = (source) =>
    source.replace(/^(?!\s*$)([^\n]+)\n[ ]{0,3}=+[ \t]*$/gm, '# $1');
  const normStripped = setextToAtx(stripped);
  const normSafe = setextToAtx(safe);

  // CommonMark: up to 3 spaces of indentation, a single '#', then spaces/tabs
  // and the heading text on the SAME line. `[ \t]+` (not `\s+`) is deliberate:
  // `\s` also matches '\n', which would let a bare '#' line swallow the next
  // line and be counted as an H1.
  const h1s = normStripped.match(/^[ ]{0,3}#[ \t]+\S.*$/gm) || [];

  // Position is judged on the un-stripped text: a code block above the H1 is
  // still content that precedes it.
  const firstNonBlank = normSafe.split('\n').find((line) => line.trim() !== '') || '';
  const startsWithH1 = /^[ ]{0,3}#[ \t]+\S/.test(firstNonBlank);

  if (h1s.length === 0) {
    return finding('error', 'llms.txt is missing the required H1 heading.',
      'Add a top-level heading with your project or site name as the first line:\n# My Project');
  }
  if (h1s.length > 1) {
    return finding('warning',
      `llms.txt contains ${h1s.length} H1 headings; the spec expects exactly one.`,
      'Keep a single top-level "# Project Name" heading. Use H2 (##) for sections.');
  }
  if (!startsWithH1) {
    return finding('warning', 'llms.txt H1 heading is not the first content in the file.',
      'Move the "# Project Name" heading to the top; the spec expects it before the blockquote summary and sections.');
  }

  // Report only the heading text: remove the opening marker and an optional
  // ATX closing sequence (`# Title ##`).
  const title = h1s[0]
    .replace(/^[ ]{0,3}#[ \t]+/, '')
    .replace(/[ \t]+#+[ \t]*$/, '')
    .trim();
  return finding('info', `H1 heading present: "${title}".`);
}
// H1 validation cases for the llms.txt checker. These run inside the existing
// `describe('llms-txt checker', ...)` suite and rely on its `check`, `join`,
// `FIXTURES`, `assert` and `it` bindings.

// Run the checker against a named fixture directory, using the same
// `dir` / `projectDir` pairing for every case.
const checkFixture = (name) => {
  const dir = join(FIXTURES, name);
  return check({ dir, projectDir: dir });
};

it('should report an error when llms.txt has no H1 heading', async () => {
  const result = await checkFixture('llms-no-h1');
  const h1Error = result.findings.find(
    (f) => f.severity === 'error' && /H1/.test(f.message)
  );
  assert.ok(h1Error, 'Should report a missing-H1 error finding');
});

it('should warn when the H1 is not the first content (code block above it)', async () => {
  const result = await checkFixture('llms-h1-not-first');
  const positionWarning = result.findings.find(
    (f) => f.severity === 'warning' && /not the first content/.test(f.message)
  );
  assert.ok(positionWarning, 'Should warn that the H1 is not the first content in the file');
});

it('should accept a setext H1 (Title\\n=====) as a valid H1', async () => {
  const result = await checkFixture('llms-setext');
  const missingH1 = result.findings.find(
    (f) => f.severity === 'error' && /missing the required H1/.test(f.message)
  );
  assert.equal(missingH1, undefined, 'Setext H1 should not trigger missing-H1 error');
  // Positive assertion: the heading text must actually have been recognised,
  // not merely "no error of one particular kind".
  const h1Info = result.findings.find(
    (f) => f.severity === 'info' && /H1 heading present: "MyProject"/.test(f.message)
  );
  assert.ok(h1Info, 'Setext H1 should be reported as the present H1 heading');
});

it('should not flag H1 position when file starts with a UTF-8 BOM', async () => {
  const result = await checkFixture('llms-bom');
  const positionWarning = result.findings.find(
    (f) => f.severity === 'warning' && /not the first content/.test(f.message)
  );
  assert.equal(positionWarning, undefined, 'BOM prefix should not cause the position check to fail');
  // If the BOM is not stripped, the H1 is not recognised at all and the result
  // is a missing-H1 *error*, which the assertion above cannot catch.
  const missingH1 = result.findings.find(
    (f) => f.severity === 'error' && /missing the required H1/.test(f.message)
  );
  assert.equal(missingH1, undefined, 'BOM prefix should not hide the H1 heading');
  const h1Info = result.findings.find(
    (f) => f.severity === 'info' && /H1 heading present: "BomProject"/.test(f.message)
  );
  assert.ok(h1Info, 'H1 after a BOM should be reported as the present H1 heading');
});