From 0a4c32640491433835fe6587a6ae2269f842a112 Mon Sep 17 00:00:00 2001 From: Federico Bartoli Date: Thu, 16 Apr 2026 09:02:42 +0200 Subject: [PATCH 1/2] Parse consecutive user-agent lines as a single RFC 9309 group MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit parseRobotsTxt previously overwrote currentAgent on every User-agent line, so robots.txt files that stack multiple user-agents before a shared block of rules (e.g. docs.nvidia.com/robots.txt) attributed those rules only to the last agent in the stack. RFC 9309 §2.1 defines a group as "one or more user-agent lines followed by one or more rules"; the ABNF grammar in §2.2 formalises this with startgroupline repeating before rules. Accumulate agents in currentAgents and reset the group only when a User-agent line follows a rule line. --- src/utils.js | 30 +++++++++++++++++++++++------- 1 file changed, 23 insertions(+), 7 deletions(-) diff --git a/src/utils.js b/src/utils.js index f60fd54..d4804f5 100644 --- a/src/utils.js +++ b/src/utils.js @@ -163,10 +163,15 @@ export const AI_AGENTS = { /** * Parse a simple robots.txt into structured rules. + * + * Handles multi-agent groups per RFC 9309 (§2.1, §2.2): consecutive User-agent + * lines before any rule share that block of rules; a User-agent line that + * follows a rule starts a new group. */ export function parseRobotsTxt(content) { const rules = []; - let currentAgent = null; + let currentAgents = []; + let lastDirectiveWasRule = false; for (const line of content.split('\n')) { const trimmed = line.trim(); @@ -174,13 +179,24 @@ export function parseRobotsTxt(content) { const [directive, ...rest] = trimmed.split(':'); const value = rest.join(':').trim(); + const directiveLower = directive.toLowerCase(); - if (directive.toLowerCase() === 'user-agent') { - currentAgent = value; - } else if (directive.toLowerCase() === 'disallow' && currentAgent) { - rules.push({ agent: currentAgent, disallow: value }); - } else if (directive.toLowerCase() === 'allow' && currentAgent) { - rules.push({ agent: currentAgent, allow: value }); + if (directiveLower === 'user-agent') { + if (lastDirectiveWasRule) { + currentAgents = []; + lastDirectiveWasRule = false; + } + currentAgents.push(value); + } else if (directiveLower === 'disallow' && currentAgents.length > 0) { + for (const agent of currentAgents) { + rules.push({ agent, disallow: value }); + } + lastDirectiveWasRule = true; + } else if (directiveLower === 'allow' && currentAgents.length > 0) { + for (const agent of currentAgents) { + rules.push({ agent, allow: value }); + } + lastDirectiveWasRule = true; } } From bddf6cbfde37e50e3b4a928f0e6b6040d905ff5f Mon Sep 17 00:00:00 2001 From: Federico Bartoli Date: Thu, 16 Apr 2026 09:03:01 +0200 Subject: [PATCH 2/2] Recognize path-specific Allow rules as explicit allowances The robots-txt checker only treated literal `Allow: /` as an explicit allowance. Sites that welcome AI crawlers with more specific rules (e.g. `Allow: /*.llms.txt$`) were still reported as having no explicitly allowed crawlers. Treat any non-empty Allow rule for a known AI crawler as an explicit allowance. Combined with the preceding parser fix, this closes the false positive reported in #5. Adds a regression fixture and tests modeled on docs.nvidia.com. Fixes #5 --- src/checkers/robots-txt.js | 8 +++- test/checkers/robots-txt.test.js | 49 ++++++++++++++++++++++ test/fixtures/nvidia-style-site/robots.txt | 12 ++++++ 3 files changed, 67 insertions(+), 2 deletions(-) create mode 100644 test/fixtures/nvidia-style-site/robots.txt diff --git a/src/checkers/robots-txt.js b/src/checkers/robots-txt.js index 0333a1d..cd9122d 100644 --- a/src/checkers/robots-txt.js +++ b/src/checkers/robots-txt.js @@ -60,9 +60,13 @@ export async function check(context) { if (isAgentBlocked(rules, crawler.name)) { blockedCrawlers.push(crawler.name); } else { - // Check if explicitly allowed + // Any non-empty Allow rule for this agent counts as an explicit + // allowance (e.g. `Allow: /`, `Allow: /*.md$`, `Allow: /docs/`). const hasExplicitAllow = rules.some( - (r) => r.agent.toLowerCase() === crawler.name.toLowerCase() && r.allow === '/' + (r) => + r.agent.toLowerCase() === crawler.name.toLowerCase() && + typeof r.allow === 'string' && + r.allow !== '' ); if (hasExplicitAllow) { allowedCrawlers.push(crawler.name); diff --git a/test/checkers/robots-txt.test.js b/test/checkers/robots-txt.test.js index 1acfbf1..96eeda0 100644 --- a/test/checkers/robots-txt.test.js +++ b/test/checkers/robots-txt.test.js @@ -1,6 +1,7 @@ import { describe, it } from 'node:test'; import assert from 'node:assert/strict'; import { check } from '../../src/checkers/robots-txt.js'; +import { parseRobotsTxt } from '../../src/utils.js'; import { join } from 'node:path'; const FIXTURES = join(import.meta.dirname, '..', 'fixtures'); @@ -37,4 +38,52 @@ describe('robots-txt checker', () => { assert.ok(Array.isArray(result.findings)); assert.ok(['pass', 'warn', 'fail', 'error'].includes(result.status)); }); + + it('should recognize path-specific Allow rules on a stacked user-agent group', async () => { + const result = await check({ + dir: join(FIXTURES, 'nvidia-style-site'), + projectDir: join(FIXTURES, 'nvidia-style-site'), + }); + const notAllowedWarning = result.findings.find( + (f) => f.severity === 'warning' && /No AI crawlers are explicitly allowed/.test(f.message) + ); + assert.equal(notAllowedWarning, undefined, 'Should not warn when stacked group has path-specific Allow rules'); + const allowedInfo = result.findings.find( + (f) => f.severity === 'info' && /explicitly allowed/.test(f.message) + ); + assert.ok(allowedInfo, 'Should report the allowed crawlers as an info finding'); + }); +}); + +describe('parseRobotsTxt', () => { + it('should attribute rules to every agent in a stacked group (RFC 9309 §2.2)', () => { + const content = [ + 'User-agent: ClaudeBot', + 'User-agent: GPTBot', + 'Allow: /*.md$', + ].join('\n'); + const rules = parseRobotsTxt(content); + const claudeAllows = rules.filter((r) => r.agent === 'ClaudeBot' && r.allow === '/*.md$'); + const gptAllows = rules.filter((r) => r.agent === 'GPTBot' && r.allow === '/*.md$'); + assert.equal(claudeAllows.length, 1, 'ClaudeBot should inherit the group Allow rule'); + assert.equal(gptAllows.length, 1, 'GPTBot should inherit the group Allow rule'); + }); + + it('should start a new group when a user-agent line follows a rule line', () => { + const content = [ + 'User-agent: ClaudeBot', + 'Allow: /a', + 'User-agent: GPTBot', + 'Allow: /b', + ].join('\n'); + const rules = parseRobotsTxt(content); + assert.deepEqual( + rules.filter((r) => r.agent === 'ClaudeBot').map((r) => r.allow), + ['/a'] + ); + assert.deepEqual( + rules.filter((r) => r.agent === 'GPTBot').map((r) => r.allow), + ['/b'] + ); + }); }); diff --git a/test/fixtures/nvidia-style-site/robots.txt b/test/fixtures/nvidia-style-site/robots.txt new file mode 100644 index 0000000..7101268 --- /dev/null +++ b/test/fixtures/nvidia-style-site/robots.txt @@ -0,0 +1,12 @@ +# Stacked user-agent group followed by path-specific allows, +# modeled on docs.nvidia.com/robots.txt. + +User-agent: ClaudeBot +User-agent: GPTBot +User-agent: PerplexityBot +User-agent: Google-Extended +Allow: /*.llms.txt$ +Allow: /*.md$ + +User-agent: * +Disallow: /*?utm*