From 0a4c32640491433835fe6587a6ae2269f842a112 Mon Sep 17 00:00:00 2001
From: Federico Bartoli <federico.bartoli@madisoft.it>
Date: Thu, 16 Apr 2026 09:02:42 +0200
Subject: [PATCH 1/2] Parse consecutive user-agent lines as a single RFC 9309
 group
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

parseRobotsTxt previously overwrote currentAgent on every User-agent
line, so for robots.txt files that stack multiple user-agents before a
shared block of rules (e.g. docs.nvidia.com/robots.txt) it attributed
those rules only to the last agent in the stack.

RFC 9309 §2.1 defines a group as "one or more user-agent lines that are
followed by one or more rules"; the ABNF grammar in §2.2 formalises
this with startgroupline repeating before rules.

Accumulate agents in currentAgents and reset the group only when a
User-agent line follows a rule line.
---
 src/utils.js | 30 +++++++++++++++++++++++-------
 1 file changed, 23 insertions(+), 7 deletions(-)

diff --git a/src/utils.js b/src/utils.js
index f60fd54..d4804f5 100644
--- a/src/utils.js
+++ b/src/utils.js
@@ -163,10 +163,15 @@ export const AI_AGENTS = {
 
 /**
  * Parse a simple robots.txt into structured rules.
+ *
+ * Handles multi-agent groups per RFC 9309 (§2.1, §2.2): consecutive User-agent
+ * lines before any rule share that block of rules; a User-agent line that
+ * follows a rule starts a new group.
  */
 export function parseRobotsTxt(content) {
   const rules = [];
-  let currentAgent = null;
+  let currentAgents = [];
+  let lastDirectiveWasRule = false;
 
   for (const line of content.split('\n')) {
     const trimmed = line.trim();
@@ -174,13 +179,24 @@ export function parseRobotsTxt(content) {
 
     const [directive, ...rest] = trimmed.split(':');
     const value = rest.join(':').trim();
+    const directiveLower = directive.toLowerCase();
 
-    if (directive.toLowerCase() === 'user-agent') {
-      currentAgent = value;
-    } else if (directive.toLowerCase() === 'disallow' && currentAgent) {
-      rules.push({ agent: currentAgent, disallow: value });
-    } else if (directive.toLowerCase() === 'allow' && currentAgent) {
-      rules.push({ agent: currentAgent, allow: value });
+    if (directiveLower === 'user-agent') {
+      if (lastDirectiveWasRule) {
+        currentAgents = [];
+        lastDirectiveWasRule = false;
+      }
+      currentAgents.push(value);
+    } else if (directiveLower === 'disallow' && currentAgents.length > 0) {
+      for (const agent of currentAgents) {
+        rules.push({ agent, disallow: value });
+      }
+      lastDirectiveWasRule = true;
+    } else if (directiveLower === 'allow' && currentAgents.length > 0) {
+      for (const agent of currentAgents) {
+        rules.push({ agent, allow: value });
+      }
+      lastDirectiveWasRule = true;
     }
   }
 

From bddf6cbfde37e50e3b4a928f0e6b6040d905ff5f Mon Sep 17 00:00:00 2001
From: Federico Bartoli <federico.bartoli@madisoft.it>
Date: Thu, 16 Apr 2026 09:03:01 +0200
Subject: [PATCH 2/2] Recognize path-specific Allow rules as explicit
 allowances

The robots-txt checker only treated literal `Allow: /` as an explicit
allowance. Sites that welcome AI crawlers with more specific rules
(e.g. `Allow: /*.llms.txt$`) were still reported as having no
explicitly allowed crawlers.

Treat any non-empty Allow rule for a known AI crawler as an explicit
allowance. Combined with the preceding parser fix, this closes the
false positive reported in #5.

Adds a regression fixture and tests modeled on docs.nvidia.com.

Fixes #5
---
 src/checkers/robots-txt.js                 |  8 +++-
 test/checkers/robots-txt.test.js           | 49 ++++++++++++++++++++++
 test/fixtures/nvidia-style-site/robots.txt | 12 ++++++
 3 files changed, 67 insertions(+), 2 deletions(-)
 create mode 100644 test/fixtures/nvidia-style-site/robots.txt

diff --git a/src/checkers/robots-txt.js b/src/checkers/robots-txt.js
index 0333a1d..cd9122d 100644
--- a/src/checkers/robots-txt.js
+++ b/src/checkers/robots-txt.js
@@ -60,9 +60,13 @@ export async function check(context) {
     if (isAgentBlocked(rules, crawler.name)) {
       blockedCrawlers.push(crawler.name);
     } else {
-      // Check if explicitly allowed
+      // Any non-empty Allow rule for this agent counts as an explicit
+      // allowance (e.g. `Allow: /`, `Allow: /*.md$`, `Allow: /docs/`).
       const hasExplicitAllow = rules.some(
-        (r) => r.agent.toLowerCase() === crawler.name.toLowerCase() && r.allow === '/'
+        (r) =>
+          r.agent.toLowerCase() === crawler.name.toLowerCase() &&
+          typeof r.allow === 'string' &&
+          r.allow !== ''
       );
       if (hasExplicitAllow) {
         allowedCrawlers.push(crawler.name);
diff --git a/test/checkers/robots-txt.test.js b/test/checkers/robots-txt.test.js
index 1acfbf1..96eeda0 100644
--- a/test/checkers/robots-txt.test.js
+++ b/test/checkers/robots-txt.test.js
@@ -1,6 +1,7 @@
 import { describe, it } from 'node:test';
 import assert from 'node:assert/strict';
 import { check } from '../../src/checkers/robots-txt.js';
+import { parseRobotsTxt } from '../../src/utils.js';
 import { join } from 'node:path';
 
 const FIXTURES = join(import.meta.dirname, '..', 'fixtures');
@@ -37,4 +38,52 @@ describe('robots-txt checker', () => {
     assert.ok(Array.isArray(result.findings));
     assert.ok(['pass', 'warn', 'fail', 'error'].includes(result.status));
   });
+
+  it('should recognize path-specific Allow rules on a stacked user-agent group', async () => {
+    const result = await check({
+      dir: join(FIXTURES, 'nvidia-style-site'),
+      projectDir: join(FIXTURES, 'nvidia-style-site'),
+    });
+    const notAllowedWarning = result.findings.find(
+      (f) => f.severity === 'warning' && /No AI crawlers are explicitly allowed/.test(f.message)
+    );
+    assert.equal(notAllowedWarning, undefined, 'Should not warn when stacked group has path-specific Allow rules');
+    const allowedInfo = result.findings.find(
+      (f) => f.severity === 'info' && /explicitly allowed/.test(f.message)
+    );
+    assert.ok(allowedInfo, 'Should report the allowed crawlers as an info finding');
+  });
+});
+
+describe('parseRobotsTxt', () => {
+  it('should attribute rules to every agent in a stacked group (RFC 9309 §2.2)', () => {
+    const content = [
+      'User-agent: ClaudeBot',
+      'User-agent: GPTBot',
+      'Allow: /*.md$',
+    ].join('\n');
+    const rules = parseRobotsTxt(content);
+    const claudeAllows = rules.filter((r) => r.agent === 'ClaudeBot' && r.allow === '/*.md$');
+    const gptAllows = rules.filter((r) => r.agent === 'GPTBot' && r.allow === '/*.md$');
+    assert.equal(claudeAllows.length, 1, 'ClaudeBot should inherit the group Allow rule');
+    assert.equal(gptAllows.length, 1, 'GPTBot should inherit the group Allow rule');
+  });
+
+  it('should start a new group when a user-agent line follows a rule line', () => {
+    const content = [
+      'User-agent: ClaudeBot',
+      'Allow: /a',
+      'User-agent: GPTBot',
+      'Allow: /b',
+    ].join('\n');
+    const rules = parseRobotsTxt(content);
+    assert.deepEqual(
+      rules.filter((r) => r.agent === 'ClaudeBot').map((r) => r.allow),
+      ['/a']
+    );
+    assert.deepEqual(
+      rules.filter((r) => r.agent === 'GPTBot').map((r) => r.allow),
+      ['/b']
+    );
+  });
 });
diff --git a/test/fixtures/nvidia-style-site/robots.txt b/test/fixtures/nvidia-style-site/robots.txt
new file mode 100644
index 0000000..7101268
--- /dev/null
+++ b/test/fixtures/nvidia-style-site/robots.txt
@@ -0,0 +1,12 @@
+# Stacked user-agent group followed by path-specific allows,
+# modeled on docs.nvidia.com/robots.txt.
+
+User-agent: ClaudeBot
+User-agent: GPTBot
+User-agent: PerplexityBot
+User-agent: Google-Extended
+Allow: /*.llms.txt$
+Allow: /*.md$
+
+User-agent: *
+Disallow: /*?utm*