Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions docs/checkers.md
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,8 @@ Checks for a well-formed `llms.txt` file, which serves as a structured index for
- File itself is under 5,000 tokens (+1)
- Organized with section headings (+1)

Also validates that the file has exactly one H1 heading placed first, as required by the spec. Violations are reported as findings (missing H1 → error; multiple H1s or H1 not first → warning).

**Specification:** https://llmstxt.org

---
Expand Down
44 changes: 44 additions & 0 deletions src/checkers/llms-txt.js
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,46 @@ const NAME = 'llms.txt Discovery Index';
const CATEGORY = 'discovery';
const MAX_SCORE = 10;

/**
 * Validate the H1 heading against the llms.txt spec.
 * The spec requires exactly one H1 and places it first in the ordered structure.
 *
 * Both ATX (`# Title`) and setext (`Title` underlined with `=`) H1s are
 * recognised. Lines inside fenced code blocks are ignored when counting, so a
 * `# comment` in a shell snippet is not mistaken for a heading.
 *
 * @param {string} content - Raw llms.txt text. Falsy values are treated as an empty file.
 * @returns {*} The value returned by `finding(...)`: severity 'error' when no
 *   H1 exists, 'warning' when there are several H1s or the H1 is not the first
 *   non-blank line, and 'info' (quoting the title) otherwise.
 */
function validateH1(content) {
  // Strip an optional UTF-8 BOM that some editors prepend on save.
  const safe = (content || '').replace(/^\uFEFF/, '');

  // For counting H1s, strip fenced code blocks so '# comment' lines inside
  // bash/etc. aren't matched. A fence is a run of one character (all backticks
  // or all tildes); the closing fence must start with the same run.
  const stripped = safe.replace(/^(`{3,}|~{3,})[^\n]*\n[\s\S]*?^\1[^\n]*$/gm, '');

  // Normalize setext H1 (`Title\n====`) to ATX (`# Title`) so the rest of the
  // checks apply uniformly. Only H1 ('='), not setext H2 ('-').
  const setextToAtx = (text) =>
    text.replace(/^(?!\s*$)([^\n]+)\n[ ]{0,3}=+[ \t]*$/gm, '# $1');
  const normStripped = setextToAtx(stripped);
  const normSafe = setextToAtx(safe);

  // CommonMark allows up to 3 spaces of indentation before an ATX heading.
  // The separator after '#' must be spaces/tabs on the SAME line: using `\s`
  // here would also match newlines, letting a bare '#' line swallow the next
  // non-blank line and be counted as a titled H1.
  const h1Regex = /^[ ]{0,3}#[ \t]+\S.*$/gm;
  const h1s = normStripped.match(h1Regex) || [];

  // For position, check the original (BOM-stripped, setext-normalized) content,
  // fences included, so a code block above the heading counts as "content".
  const firstNonBlank = normSafe.split('\n').find((l) => l.trim() !== '') || '';
  const startsWithH1 = /^[ ]{0,3}#[ \t]+\S/.test(firstNonBlank);

  if (h1s.length === 0) {
    return finding('error', 'llms.txt is missing the required H1 heading.',
      'Add a top-level heading with your project or site name as the first line:\n# My Project');
  }
  if (h1s.length > 1) {
    return finding('warning',
      `llms.txt contains ${h1s.length} H1 headings; the spec expects exactly one.`,
      'Keep a single top-level "# Project Name" heading. Use H2 (##) for sections.');
  }
  if (!startsWithH1) {
    return finding('warning', 'llms.txt H1 heading is not the first content in the file.',
      'Move the "# Project Name" heading to the top; the spec expects it before the blockquote summary and sections.');
  }
  // Drop the heading marker (indent, '#', separator) to report just the title.
  const title = h1s[0].replace(/^[ ]{0,3}#[ \t]+/, '').trim();
  return finding('info', `H1 heading present: "${title}".`);
}

/**
* Check llms.txt exists and is well-formed.
*
Expand Down Expand Up @@ -48,6 +88,10 @@ export async function check(context) {
score += 3;
findings.push(finding('info', `llms.txt found at ${foundPath}.`));

// Spec: single H1 with the project/site name, first in the ordered structure.
// https://llmstxt.org
findings.push(validateH1(content));

// Check for structured links [title](url)
const linkPattern = /\[([^\]]+)\]\(([^)]+)\)/g;
const links = [...content.matchAll(linkPattern)];
Expand Down
32 changes: 32 additions & 0 deletions test/checkers/llms-txt.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,4 +26,36 @@ describe('llms-txt checker', () => {
const withFix = result.findings.filter((f) => f.fix);
assert.ok(withFix.length > 0, 'Should provide fix suggestions');
});

it('should report an error when llms.txt has no H1 heading', async () => {
  // Run the checker against the fixture directory whose llms.txt has no H1.
  const fixtureDir = join(FIXTURES, 'llms-no-h1');
  const { findings } = await check({ dir: fixtureDir, projectDir: fixtureDir });

  // Any error-level finding that mentions "H1" satisfies the expectation.
  const isH1Error = (f) => f.severity === 'error' && /H1/.test(f.message);
  assert.ok(findings.find(isH1Error), 'Should report a missing-H1 error finding');
});

it('should warn when the H1 is not the first content (code block above it)', async () => {
  // Run the checker against the fixture where a fenced block precedes the H1.
  const fixtureDir = join(FIXTURES, 'llms-h1-not-first');
  const { findings } = await check({ dir: fixtureDir, projectDir: fixtureDir });

  // Look for the warning-level position finding by its message text.
  const isPositionWarning = (f) =>
    f.severity === 'warning' && /not the first content/.test(f.message);
  assert.ok(
    findings.find(isPositionWarning),
    'Should warn that the H1 is not the first content in the file'
  );
});

it('should accept a setext H1 (Title\\n=====) as a valid H1', async () => {
  // Run the checker against the fixture whose title is underlined with '='.
  const fixtureDir = join(FIXTURES, 'llms-setext');
  const { findings } = await check({ dir: fixtureDir, projectDir: fixtureDir });

  // No error-level "missing H1" finding may be present.
  const isMissingH1 = (f) =>
    f.severity === 'error' && /missing the required H1/.test(f.message);
  assert.equal(
    findings.find(isMissingH1),
    undefined,
    'Setext H1 should not trigger missing-H1 error'
  );
});

it('should not flag H1 position when file starts with a UTF-8 BOM', async () => {
  // Run the checker against the BOM fixture directory.
  const fixtureDir = join(FIXTURES, 'llms-bom');
  const { findings } = await check({ dir: fixtureDir, projectDir: fixtureDir });

  // The position warning must be absent for this fixture.
  const isPositionWarning = (f) =>
    f.severity === 'warning' && /not the first content/.test(f.message);
  assert.equal(
    findings.find(isPositionWarning),
    undefined,
    'BOM prefix should not cause the position check to fail'
  );
});
});
7 changes: 7 additions & 0 deletions test/fixtures/llms-bom/llms.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
# BomProject

> A project whose llms.txt starts with a UTF-8 BOM.

## Section

- [Guide](https://example.com/guide): Intro (~200 tokens)
12 changes: 12 additions & 0 deletions test/fixtures/llms-h1-not-first/llms.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
```bash
# install
npm install example
```

# ExampleDocs

> A project whose llms.txt starts with a code block before the H1.

## Getting Started

- [Quickstart](https://example.com/quickstart): First steps (~500 tokens)
9 changes: 9 additions & 0 deletions test/fixtures/llms-no-h1/llms.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
> A site with an llms.txt that has links and sections but no H1.

## Getting Started

- [Quickstart](https://example.com/quickstart): Get going in 5 minutes (~500 tokens)

## API Reference

- [Users API](https://example.com/api/users): CRUD for user accounts (~800 tokens)
8 changes: 8 additions & 0 deletions test/fixtures/llms-setext/llms.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
MyProject
=========

> A project using setext H1 syntax (title underlined with '=').

## Section

- [Guide](https://example.com/guide): Intro (~200 tokens)