From 84864dd67c1dbc521cc270b7d131e6c4051fc811 Mon Sep 17 00:00:00 2001 From: Werner Bisschoff Date: Sat, 20 Jun 2026 16:06:14 +0200 Subject: [PATCH] fix(context): prevent OOM crash when building packages from large llms-full.txt files Large markdown files (>1MB) like Cloudflare's llms-full.txt previously caused Node.js heap OOM because remark-parse built a full AST of the entire document. Now they are pre-split by ## headings so each chunk is independently parseable with minimal memory. Fixes: #99 --- .changeset/thin-ligers-walk.md | 5 + packages/context/src/package-builder.test.ts | 119 ++++++++++++++++++- packages/context/src/package-builder.ts | 44 ++++++- 3 files changed, 166 insertions(+), 2 deletions(-) create mode 100644 .changeset/thin-ligers-walk.md diff --git a/.changeset/thin-ligers-walk.md b/.changeset/thin-ligers-walk.md new file mode 100644 index 0000000..618f855 --- /dev/null +++ b/.changeset/thin-ligers-walk.md @@ -0,0 +1,5 @@ +--- +"@neuledge/context": patch +--- + +Fix OOM crash when building packages from large llms-full.txt files (e.g., Cloudflare docs). Large markdown files (>1MB) are now pre-split by `##` headings before AST parsing so individual chunks stay small. diff --git a/packages/context/src/package-builder.test.ts b/packages/context/src/package-builder.test.ts index a3abb03..83dfd5c 100644 --- a/packages/context/src/package-builder.test.ts +++ b/packages/context/src/package-builder.test.ts @@ -3,7 +3,7 @@ import { tmpdir } from "node:os"; import { join } from "node:path"; import { afterEach, beforeAll, describe, expect, it } from "vitest"; import { initDatabase, openDatabase } from "./database.js"; -import { buildPackage } from "./package-builder.js"; +import { buildPackage, splitMarkdownByHeadings } from "./package-builder.js"; describe("buildPackage", () => { beforeAll(async () => { @@ -309,3 +309,120 @@ Run the install command. 
} }); }); + +describe("splitMarkdownByHeadings", () => { /* Calls the exported pre-splitter directly on small in-memory { path, content } fixtures. */ + it("splits content by ## headings into preamble + per-section parts", () => { + const file = { + path: "test.txt", + content: `# Docs + +Intro text. + +## Workers + +Workers content. + +## Pages + +Pages content.`, + }; + const result = splitMarkdownByHeadings(file); + // preamble + 2 sections = 3 parts + expect(result).toHaveLength(3); + expect(result[0]?.content).toContain("Intro text."); + expect(result[0]?.content).not.toContain("## Workers"); + expect(result[1]?.content).toContain("## Workers"); + expect(result[1]?.content).toContain("Workers content."); + expect(result[2]?.content).toContain("## Pages"); + expect(result[2]?.content).toContain("Pages content."); + }); + + it("handles content starting with ## (no preamble)", () => { + const file = { + path: "test.txt", + content: `## Alpha + +Alpha content. + +## Beta + +Beta content.`, + }; + const result = splitMarkdownByHeadings(file); + expect(result).toHaveLength(2); + expect(result[0]?.content).toMatch(/^## Alpha/); + expect(result[0]?.content).toContain("Alpha content."); + expect(result[1]?.content).toMatch(/^## Beta/); + expect(result[1]?.content).toContain("Beta content."); + }); + + it("returns original file when no ## headings exist", () => { + const file = { + path: "readme.md", + content: "# Title\n\nJust a single section.\n\n### Subheading\n\nMore content.", + }; + const result = splitMarkdownByHeadings(file); /* the fixture has "# " and "### " lines only; neither is expected to act as a split point */ + expect(result).toEqual([file]); + }); + + it("returns original file when content starts with a single ## section (no split needed)", () => { + const file = { + path: "single.txt", + content: "## Only One Section\n\nContent here.", + }; + const result = splitMarkdownByHeadings(file); /* a lone section leaves nothing to split, so the result is expected to equal [file] */ + expect(result).toEqual([file]); + }); + + it("preserves the preamble before the first ## heading", () => { + const file = { + path: "docs.md", + content: `--- +title: Docs +--- + +# Title + +Intro paragraph. 
+ +## First Section + +Content here.`, + }; + const result = splitMarkdownByHeadings(file); + expect(result).toHaveLength(2); /* expected: one preamble part followed by one "## First Section" part */ + expect(result[0]?.content).toContain("Intro paragraph."); + expect(result[1]?.content).toContain("## First Section"); + expect(result[1]?.content).toContain("Content here."); + }); + + it("preserves empty lines within sections", () => { + const file = { + path: "spacing.txt", + content: `## Section A + +Line 1. + +Line 2. + +## Section B + +Line 3.`, + }; + const result = splitMarkdownByHeadings(file); + expect(result).toHaveLength(2); + expect(result[0]?.content).toContain("Line 1.\n\nLine 2."); + expect(result[1]?.content).toContain("Line 3."); + }); + + it("preserves doc_path across all split parts", () => { + const file = { + path: "cloudflare.com/llms-full.txt", + content: `## Workers\nContent.\n\n## Pages\nContent.`, + }; + const result = splitMarkdownByHeadings(file); + for (const part of result) { /* every part is expected to carry the source file's path */ + expect(part.path).toBe("cloudflare.com/llms-full.txt"); + } + }); +}); diff --git a/packages/context/src/package-builder.ts b/packages/context/src/package-builder.ts index 135ec61..1ea9795 100644 --- a/packages/context/src/package-builder.ts +++ b/packages/context/src/package-builder.ts @@ -35,6 +35,37 @@ export interface BuildResult { totalTokens: number; } +/** Files larger than this are pre-split by ## headings before AST parsing to avoid OOM. */ +const MAX_FILE_SIZE_FOR_PARSING = 1024 * 1024; /* compared against file.content.length (UTF-16 code units, not bytes), so the "1MB" label is approximate */ // 1MB + +/** Pre-split oversized markdown by ## headings into independently parseable chunks. 
+ *
+ * A line opens a new chunk only when it starts with "## " and lies outside a fenced
+ * code block. Cutting inside a fence would leave one chunk with an unterminated fence
+ * and make the next chunk's closing fence open a new one, so lines such as
+ * "## comment" inside shell or YAML samples are kept with their section.
+ * Whitespace-only chunks (e.g. blank lines ahead of the first heading) are dropped.
+ *
+ * @param file - Markdown file to split; it is not modified.
+ * @returns The chunks in document order, each carrying `file.path`, or `[file]`
+ *   itself when fewer than two non-blank chunks would result.
+ */
+export function splitMarkdownByHeadings(file: MarkdownFile): MarkdownFile[] {
+  // Cheap exit: without any "## " line there is nothing to split on.
+  if (!/^## /m.test(file.content)) {
+    return [file];
+  }
+
+  // CommonMark fence line: up to 3 spaces of indent, then a run of 3+ backticks or
+  // tildes, then the info string. A trailing "\r" is tolerated for CRLF input.
+  const fencePattern = /^ {0,3}(`{3,}|~{3,})([^\r\n]*)\r?$/;
+
+  const parts: string[] = [];
+  let current: string[] = [];
+  // Marker run of the fence we are currently inside; undefined when outside any fence.
+  let openFence: string | undefined;
+
+  const flush = (): void => {
+    const chunk = current.join("\n");
+    // An empty or blank chunk carries no content worth parsing on its own.
+    if (chunk.trim() !== "") {
+      parts.push(chunk);
+    }
+  };
+
+  for (const line of file.content.split("\n")) {
+    const fence = fencePattern.exec(line);
+    const marker = fence?.[1] ?? "";
+    const rest = fence?.[2] ?? "";
+
+    if (openFence === undefined) {
+      // A backtick fence whose info string contains a backtick is inline code, not a fence.
+      if (marker !== "" && !(marker.startsWith("`") && rest.includes("`"))) {
+        openFence = marker;
+      } else if (line.startsWith("## ")) {
+        flush();
+        current = [];
+      }
+    } else if (
+      // A closing fence uses the same character, is at least as long as the opening
+      // run, and is followed by nothing but whitespace. An unclosed fence runs to EOF.
+      marker.startsWith(openFence.charAt(0)) &&
+      marker.length >= openFence.length &&
+      rest.trim() === ""
+    ) {
+      openFence = undefined;
+    }
+    current.push(line);
+  }
+  flush();
+
+  if (parts.length <= 1) return [file];
+
+  return parts.map((content) => ({ path: file.path, content }));
+}
+
 /**
  * Build a documentation package from markdown files.
  */
@@ -95,7 +126,18 @@ export function buildPackage(
   const allSections: DocSection[] = [];
   const seenHashes = new Set();
 
-  for (const file of files) {
+  // Pre-split oversized files by ## headings to avoid OOM during AST parsing
+  const processedFiles = files.flatMap((file) => {
+    if (
+      file.content.length > MAX_FILE_SIZE_FOR_PARSING &&
+      /\.(md|mdx|txt)$/i.test(file.path)
+    ) {
+      return splitMarkdownByHeadings(file);
+    }
+    return [file];
+  });
+
+  for (const file of processedFiles) {
     try {
       const parsed = parseDocument(file.content, file.path);
       for (const section of parsed.sections) {