Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 6 additions & 2 deletions src/checkers/robots-txt.js
Original file line number Diff line number Diff line change
Expand Up @@ -60,9 +60,13 @@ export async function check(context) {
if (isAgentBlocked(rules, crawler.name)) {
blockedCrawlers.push(crawler.name);
} else {
// Check if explicitly allowed
// Any non-empty Allow rule for this agent counts as an explicit
// allowance (e.g. `Allow: /`, `Allow: /*.md$`, `Allow: /docs/`).
const hasExplicitAllow = rules.some(
(r) => r.agent.toLowerCase() === crawler.name.toLowerCase() && r.allow === '/'
(r) =>
r.agent.toLowerCase() === crawler.name.toLowerCase() &&
typeof r.allow === 'string' &&
r.allow !== ''
);
if (hasExplicitAllow) {
allowedCrawlers.push(crawler.name);
Expand Down
30 changes: 23 additions & 7 deletions src/utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -163,24 +163,40 @@ export const AI_AGENTS = {

/**
* Parse a simple robots.txt into structured rules.
*
* Handles multi-agent groups per RFC 9309 (§2.1, §2.2): consecutive User-agent
* lines before any rule share that block of rules; a User-agent line that
* follows a rule starts a new group.
*/
export function parseRobotsTxt(content) {
const rules = [];
let currentAgent = null;
let currentAgents = [];
let lastDirectiveWasRule = false;

for (const line of content.split('\n')) {
const trimmed = line.trim();
if (!trimmed || trimmed.startsWith('#')) continue;

const [directive, ...rest] = trimmed.split(':');
const value = rest.join(':').trim();
const directiveLower = directive.toLowerCase();

if (directive.toLowerCase() === 'user-agent') {
currentAgent = value;
} else if (directive.toLowerCase() === 'disallow' && currentAgent) {
rules.push({ agent: currentAgent, disallow: value });
} else if (directive.toLowerCase() === 'allow' && currentAgent) {
rules.push({ agent: currentAgent, allow: value });
if (directiveLower === 'user-agent') {
if (lastDirectiveWasRule) {
currentAgents = [];
lastDirectiveWasRule = false;
}
currentAgents.push(value);
} else if (directiveLower === 'disallow' && currentAgents.length > 0) {
for (const agent of currentAgents) {
rules.push({ agent, disallow: value });
}
lastDirectiveWasRule = true;
} else if (directiveLower === 'allow' && currentAgents.length > 0) {
for (const agent of currentAgents) {
rules.push({ agent, allow: value });
}
lastDirectiveWasRule = true;
}
}

Expand Down
49 changes: 49 additions & 0 deletions test/checkers/robots-txt.test.js
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import { describe, it } from 'node:test';
import assert from 'node:assert/strict';
import { check } from '../../src/checkers/robots-txt.js';
import { parseRobotsTxt } from '../../src/utils.js';
import { join } from 'node:path';

const FIXTURES = join(import.meta.dirname, '..', 'fixtures');
Expand Down Expand Up @@ -37,4 +38,52 @@ describe('robots-txt checker', () => {
assert.ok(Array.isArray(result.findings));
assert.ok(['pass', 'warn', 'fail', 'error'].includes(result.status));
});

// Runs the checker against the `nvidia-style-site` fixture and verifies that
// the "no crawlers allowed" warning is absent while an "explicitly allowed"
// info finding is present.
it('should recognize path-specific Allow rules on a stacked user-agent group', async () => {
  // The same fixture directory is passed as both `dir` and `projectDir`.
  const siteDir = join(FIXTURES, 'nvidia-style-site');
  const { findings } = await check({ dir: siteDir, projectDir: siteDir });

  // First finding with the given severity whose message matches `pattern`,
  // or undefined when there is none.
  const firstFinding = (severity, pattern) =>
    findings.find((finding) => finding.severity === severity && pattern.test(finding.message));

  assert.equal(
    firstFinding('warning', /No AI crawlers are explicitly allowed/),
    undefined,
    'Should not warn when stacked group has path-specific Allow rules'
  );

  assert.ok(
    firstFinding('info', /explicitly allowed/),
    'Should report the allowed crawlers as an info finding'
  );
});
});

// Unit tests for how parseRobotsTxt assigns Allow rules to user-agent groups.
describe('parseRobotsTxt', () => {
  // Collects, in order, the `allow` value of every parsed rule whose `agent`
  // is exactly `agentName`.
  const allowsFor = (parsedRules, agentName) =>
    parsedRules.filter((rule) => rule.agent === agentName).map((rule) => rule.allow);

  it('should attribute rules to every agent in a stacked group (RFC 9309 §2.2)', () => {
    // Two consecutive User-agent lines followed by a single Allow rule.
    const robotsLines = [
      'User-agent: ClaudeBot',
      'User-agent: GPTBot',
      'Allow: /*.md$',
    ];
    const parsed = parseRobotsTxt(robotsLines.join('\n'));

    // Number of rules that pair the given agent with the shared Allow pattern.
    const countMarkdownAllows = (agentName) =>
      parsed.filter((rule) => rule.agent === agentName && rule.allow === '/*.md$').length;

    const claudeCount = countMarkdownAllows('ClaudeBot');
    const gptCount = countMarkdownAllows('GPTBot');
    assert.equal(claudeCount, 1, 'ClaudeBot should inherit the group Allow rule');
    assert.equal(gptCount, 1, 'GPTBot should inherit the group Allow rule');
  });

  it('should start a new group when a user-agent line follows a rule line', () => {
    // Each User-agent line here is separated from the next by a rule line, so
    // each agent is expected to keep only its own Allow value.
    const robotsLines = [
      'User-agent: ClaudeBot',
      'Allow: /a',
      'User-agent: GPTBot',
      'Allow: /b',
    ];
    const parsed = parseRobotsTxt(robotsLines.join('\n'));

    assert.deepEqual(allowsFor(parsed, 'ClaudeBot'), ['/a']);
    assert.deepEqual(allowsFor(parsed, 'GPTBot'), ['/b']);
  });
});
12 changes: 12 additions & 0 deletions test/fixtures/nvidia-style-site/robots.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# Stacked user-agent group followed by path-specific allows,
# modeled on docs.nvidia.com/robots.txt.

User-agent: ClaudeBot
User-agent: GPTBot
User-agent: PerplexityBot
User-agent: Google-Extended
Allow: /*.llms.txt$
Allow: /*.md$

User-agent: *
Disallow: /*?utm*