From 84864dd67c1dbc521cc270b7d131e6c4051fc811 Mon Sep 17 00:00:00 2001
From: Werner Bisschoff <werner@bisschoff.dev>
Date: Sat, 20 Jun 2026 16:06:14 +0200
Subject: [PATCH] fix(context): prevent OOM crash when building packages from
 large llms-full.txt files

Large markdown files (>1MB) like Cloudflare's llms-full.txt previously
caused a Node.js heap OOM because remark-parse built a full AST of the
entire document. Files over that size with a .md, .mdx, or .txt path are
now pre-split at ## headings, so each chunk is parsed on its own and no
single AST has to hold the whole document.

Fixes: #99
---
 .changeset/thin-ligers-walk.md               |   5 +
 packages/context/src/package-builder.test.ts | 119 ++++++++++++++++++-
 packages/context/src/package-builder.ts      |  44 ++++++-
 3 files changed, 166 insertions(+), 2 deletions(-)
 create mode 100644 .changeset/thin-ligers-walk.md

diff --git a/.changeset/thin-ligers-walk.md b/.changeset/thin-ligers-walk.md
new file mode 100644
index 0000000..618f855
--- /dev/null
+++ b/.changeset/thin-ligers-walk.md
@@ -0,0 +1,5 @@
+---
+"@neuledge/context": patch
+---
+
+Fix OOM crash when building packages from large llms-full.txt files (e.g., Cloudflare docs). Large markdown files (>1MB) are now pre-split by `##` headings before AST parsing so individual chunks stay small.
diff --git a/packages/context/src/package-builder.test.ts b/packages/context/src/package-builder.test.ts
index a3abb03..83dfd5c 100644
--- a/packages/context/src/package-builder.test.ts
+++ b/packages/context/src/package-builder.test.ts
@@ -3,7 +3,7 @@ import { tmpdir } from "node:os";
 import { join } from "node:path";
 import { afterEach, beforeAll, describe, expect, it } from "vitest";
 import { initDatabase, openDatabase } from "./database.js";
-import { buildPackage } from "./package-builder.js";
+import { buildPackage, splitMarkdownByHeadings } from "./package-builder.js";
 
 describe("buildPackage", () => {
   beforeAll(async () => {
@@ -309,3 +309,120 @@ Run the install command.
     }
   });
 });
+
+describe("splitMarkdownByHeadings", () => {
+  it("splits content by ## headings into preamble + per-section parts", () => {
+    const file = {
+      path: "test.txt",
+      content: `# Docs
+
+Intro text.
+
+## Workers
+
+Workers content.
+
+## Pages
+
+Pages content.`,
+    };
+    const result = splitMarkdownByHeadings(file);
+    // "# Docs" preamble + the Workers and Pages sections = 3 parts
+    expect(result).toHaveLength(3);
+    expect(result[0]?.content).toContain("Intro text.");
+    expect(result[0]?.content).not.toContain("## Workers"); // the heading line opens the next part
+    expect(result[1]?.content).toContain("## Workers");
+    expect(result[1]?.content).toContain("Workers content.");
+    expect(result[2]?.content).toContain("## Pages");
+    expect(result[2]?.content).toContain("Pages content.");
+  });
+
+  it("handles content starting with ## (no preamble)", () => {
+    const file = {
+      path: "test.txt",
+      content: `## Alpha
+
+Alpha content.
+
+## Beta
+
+Beta content.`,
+    };
+    const result = splitMarkdownByHeadings(file);
+    expect(result).toHaveLength(2); // no empty preamble part is emitted
+    expect(result[0]?.content).toMatch(/^## Alpha/);
+    expect(result[0]?.content).toContain("Alpha content.");
+    expect(result[1]?.content).toMatch(/^## Beta/);
+    expect(result[1]?.content).toContain("Beta content.");
+  });
+
+  it("returns original file when no ## headings exist", () => {
+    const file = {
+      path: "readme.md",
+      content: "# Title\n\nJust a single section.\n\n### Subheading\n\nMore content.",
+    };
+    const result = splitMarkdownByHeadings(file);
+    expect(result).toEqual([file]); // "#" and "###" headings are not split points
+  });
+
+  it("returns original file when content starts with a single ## section (no split needed)", () => {
+    const file = {
+      path: "single.txt",
+      content: "## Only One Section\n\nContent here.",
+    };
+    const result = splitMarkdownByHeadings(file);
+    expect(result).toEqual([file]); // a single part means there is nothing to split
+  });
+
+  it("preserves the preamble before the first ## heading", () => {
+    const file = {
+      path: "docs.md",
+      content: `---
+title: Docs
+---
+
+# Title
+
+Intro paragraph.
+
+## First Section
+
+Content here.`,
+    };
+    const result = splitMarkdownByHeadings(file);
+    expect(result).toHaveLength(2); // one preamble part (front matter, "# Title", intro) + one section part
+    expect(result[0]?.content).toContain("Intro paragraph.");
+    expect(result[1]?.content).toContain("## First Section");
+    expect(result[1]?.content).toContain("Content here.");
+  });
+
+  it("preserves empty lines within sections", () => {
+    const file = {
+      path: "spacing.txt",
+      content: `## Section A
+
+Line 1.
+
+Line 2.
+
+## Section B
+
+Line 3.`,
+    };
+    const result = splitMarkdownByHeadings(file);
+    expect(result).toHaveLength(2);
+    expect(result[0]?.content).toContain("Line 1.\n\nLine 2."); // the blank line between paragraphs is kept
+    expect(result[1]?.content).toContain("Line 3.");
+  });
+
+  it("preserves doc_path across all split parts", () => {
+    const file = {
+      path: "cloudflare.com/llms-full.txt",
+      content: `## Workers\nContent.\n\n## Pages\nContent.`,
+    };
+    const result = splitMarkdownByHeadings(file);
+    for (const part of result) {
+      expect(part.path).toBe("cloudflare.com/llms-full.txt"); // every part carries the source file's path
+    }
+  });
+});
diff --git a/packages/context/src/package-builder.ts b/packages/context/src/package-builder.ts
index 135ec61..1ea9795 100644
--- a/packages/context/src/package-builder.ts
+++ b/packages/context/src/package-builder.ts
@@ -35,6 +35,37 @@ export interface BuildResult {
   totalTokens: number;
 }
 
+/** Content length (string `.length`, not bytes) above which a file is pre-split by ## headings before AST parsing to avoid OOM. */
+const MAX_FILE_SIZE_FOR_PARSING = 1024 * 1024; // 1,048,576 characters (~1MB of ASCII text)
+
+/**
+ * Pre-split oversized markdown by `## ` headings into independently parseable chunks.
+ * Text before the first heading is its own chunk; `## ` inside a code fence never splits.
+ * @returns Chunks in document order sharing `file.path`, or `[file]` when nothing splits.
+ */
+export function splitMarkdownByHeadings(file: MarkdownFile): MarkdownFile[] {
+  const fencePattern = /^ {0,3}(`{3,}(?=[^`]*$)|~{3,})/;
+  const parts: string[] = [];
+  let current: string[] = [];
+  let fence: string | undefined; // marker that opened the current code fence
+  for (const line of file.content.split("\n")) {
+    const run = fencePattern.exec(line)?.[1];
+    if (fence !== undefined) {
+      // Only a bare run of the same character, at least as long, closes the fence.
+      if (run?.startsWith(fence) && line.trim() === run) fence = undefined;
+    } else if (run !== undefined) {
+      fence = run;
+    } else if (line.startsWith("## ") && current.length > 0) {
+      parts.push(current.join("\n"));
+      current = [];
+    }
+    current.push(line);
+  }
+  parts.push(current.join("\n"));
+  if (parts.length <= 1) return [file];
+  return parts.map((content) => ({ path: file.path, content }));
+}
+
 /**
  * Build a documentation package from markdown files.
  */
@@ -95,7 +126,18 @@ export function buildPackage(
     const allSections: DocSection[] = [];
     const seenHashes = new Set<string>();
 
-    for (const file of files) {
+    // Pre-split oversized files by ## headings to avoid OOM during AST parsing
+    const processedFiles = files.flatMap((file) => {
+      if (
+        file.content.length > MAX_FILE_SIZE_FOR_PARSING &&
+        /\.(md|mdx|txt)$/i.test(file.path)
+      ) {
+        return splitMarkdownByHeadings(file);
+      }
+      return [file];
+    });
+
+    for (const file of processedFiles) {
       try {
         const parsed = parseDocument(file.content, file.path);
         for (const section of parsed.sections) {