robots.txt checker: honor stacked User-agent groups and path-specific Allow rules

  src/checkers/robots-txt.js: any non-empty Allow value for a crawler now counts
  as an explicit allowance (previously only `Allow: /` did).
  src/utils.js: parseRobotsTxt() attributes each Allow/Disallow rule to every
  agent named by the consecutive User-agent lines that precede it; a User-agent
  line that follows a rule starts a new group.
  test/: parser unit tests plus a fixture with a stacked group and
  path-specific Allow rules.

diff --git a/src/checkers/robots-txt.js b/src/checkers/robots-txt.js
index 0333a1d..cd9122d 100644
--- a/src/checkers/robots-txt.js
+++ b/src/checkers/robots-txt.js
@@ -60,9 +60,13 @@ export async function check(context) {
       if (isAgentBlocked(rules, crawler.name)) {
         blockedCrawlers.push(crawler.name);
       } else {
-        // Check if explicitly allowed
+        // Any non-empty Allow rule for this agent counts as an explicit
+        // allowance (e.g. `Allow: /`, `Allow: /*.md$`, `Allow: /docs/`).
         const hasExplicitAllow = rules.some(
-          (r) => r.agent.toLowerCase() === crawler.name.toLowerCase() && r.allow === '/'
+          (r) =>
+            r.agent.toLowerCase() === crawler.name.toLowerCase() &&
+            typeof r.allow === 'string' &&
+            r.allow !== ''
         );
         if (hasExplicitAllow) {
           allowedCrawlers.push(crawler.name);
diff --git a/src/utils.js b/src/utils.js
index f60fd54..d4804f5 100644
--- a/src/utils.js
+++ b/src/utils.js
@@ -163,10 +163,15 @@ export const AI_AGENTS = {
 
 /**
  * Parse a simple robots.txt into structured rules.
+ *
+ * Handles multi-agent groups per RFC 9309 (§2.1, §2.2): consecutive User-agent
+ * lines before any rule share that block of rules; a User-agent line that
+ * follows a rule starts a new group.
  */
 export function parseRobotsTxt(content) {
   const rules = [];
-  let currentAgent = null;
+  let currentAgents = [];
+  let lastDirectiveWasRule = false;
 
   for (const line of content.split('\n')) {
     const trimmed = line.trim();
@@ -174,13 +179,24 @@ export function parseRobotsTxt(content) {
 
     const [directive, ...rest] = trimmed.split(':');
     const value = rest.join(':').trim();
+    const directiveLower = directive.toLowerCase();
 
-    if (directive.toLowerCase() === 'user-agent') {
-      currentAgent = value;
-    } else if (directive.toLowerCase() === 'disallow' && currentAgent) {
-      rules.push({ agent: currentAgent, disallow: value });
-    } else if (directive.toLowerCase() === 'allow' && currentAgent) {
-      rules.push({ agent: currentAgent, allow: value });
+    if (directiveLower === 'user-agent') {
+      if (lastDirectiveWasRule) {
+        currentAgents = [];
+        lastDirectiveWasRule = false;
+      }
+      currentAgents.push(value);
+    } else if (directiveLower === 'disallow' && currentAgents.length > 0) {
+      for (const agent of currentAgents) {
+        rules.push({ agent, disallow: value });
+      }
+      lastDirectiveWasRule = true;
+    } else if (directiveLower === 'allow' && currentAgents.length > 0) {
+      for (const agent of currentAgents) {
+        rules.push({ agent, allow: value });
+      }
+      lastDirectiveWasRule = true;
     }
   }
 
diff --git a/test/checkers/robots-txt.test.js b/test/checkers/robots-txt.test.js
index 1acfbf1..96eeda0 100644
--- a/test/checkers/robots-txt.test.js
+++ b/test/checkers/robots-txt.test.js
@@ -1,6 +1,7 @@
 import { describe, it } from 'node:test';
 import assert from 'node:assert/strict';
 import { check } from '../../src/checkers/robots-txt.js';
+import { parseRobotsTxt } from '../../src/utils.js';
 import { join } from 'node:path';
 
 const FIXTURES = join(import.meta.dirname, '..', 'fixtures');
@@ -37,4 +38,52 @@ describe('robots-txt checker', () => {
     assert.ok(Array.isArray(result.findings));
     assert.ok(['pass', 'warn', 'fail', 'error'].includes(result.status));
   });
+
+  it('should recognize path-specific Allow rules on a stacked user-agent group', async () => {
+    const result = await check({
+      dir: join(FIXTURES, 'nvidia-style-site'),
+      projectDir: join(FIXTURES, 'nvidia-style-site'),
+    });
+    const notAllowedWarning = result.findings.find(
+      (f) => f.severity === 'warning' && /No AI crawlers are explicitly allowed/.test(f.message)
+    );
+    assert.equal(notAllowedWarning, undefined, 'Should not warn when stacked group has path-specific Allow rules');
+    const allowedInfo = result.findings.find(
+      (f) => f.severity === 'info' && /explicitly allowed/.test(f.message)
+    );
+    assert.ok(allowedInfo, 'Should report the allowed crawlers as an info finding');
+  });
+});
+
+describe('parseRobotsTxt', () => {
+  it('should attribute rules to every agent in a stacked group (RFC 9309 §2.2)', () => {
+    const content = [
+      'User-agent: ClaudeBot',
+      'User-agent: GPTBot',
+      'Allow: /*.md$',
+    ].join('\n');
+    const rules = parseRobotsTxt(content);
+    const claudeAllows = rules.filter((r) => r.agent === 'ClaudeBot' && r.allow === '/*.md$');
+    const gptAllows = rules.filter((r) => r.agent === 'GPTBot' && r.allow === '/*.md$');
+    assert.equal(claudeAllows.length, 1, 'ClaudeBot should inherit the group Allow rule');
+    assert.equal(gptAllows.length, 1, 'GPTBot should inherit the group Allow rule');
+  });
+
+  it('should start a new group when a user-agent line follows a rule line', () => {
+    const content = [
+      'User-agent: ClaudeBot',
+      'Allow: /a',
+      'User-agent: GPTBot',
+      'Allow: /b',
+    ].join('\n');
+    const rules = parseRobotsTxt(content);
+    assert.deepEqual(
+      rules.filter((r) => r.agent === 'ClaudeBot').map((r) => r.allow),
+      ['/a']
+    );
+    assert.deepEqual(
+      rules.filter((r) => r.agent === 'GPTBot').map((r) => r.allow),
+      ['/b']
+    );
+  });
 });
diff --git a/test/fixtures/nvidia-style-site/robots.txt b/test/fixtures/nvidia-style-site/robots.txt
new file mode 100644
index 0000000..7101268
--- /dev/null
+++ b/test/fixtures/nvidia-style-site/robots.txt
@@ -0,0 +1,12 @@
+# Stacked user-agent group followed by path-specific allows,
+# modeled on docs.nvidia.com/robots.txt.
+
+User-agent: ClaudeBot
+User-agent: GPTBot
+User-agent: PerplexityBot
+User-agent: Google-Extended
+Allow: /*.llms.txt$
+Allow: /*.md$
+
+User-agent: *
+Disallow: /*?utm*