Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 5 additions & 0 deletions .changeset/thin-ligers-walk.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
---
"@neuledge/context": patch
---

Fix OOM crash when building packages from large `llms-full.txt` files (e.g., Cloudflare docs). Markdown and text files (`.md`, `.mdx`, `.txt`) larger than 1MB are now pre-split at `##` headings before AST parsing, so each chunk is parsed independently and stays small.
119 changes: 118 additions & 1 deletion packages/context/src/package-builder.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ import { tmpdir } from "node:os";
import { join } from "node:path";
import { afterEach, beforeAll, describe, expect, it } from "vitest";
import { initDatabase, openDatabase } from "./database.js";
import { buildPackage } from "./package-builder.js";
import { buildPackage, splitMarkdownByHeadings } from "./package-builder.js";

describe("buildPackage", () => {
beforeAll(async () => {
Expand Down Expand Up @@ -309,3 +309,120 @@ Run the install command.
}
});
});

// Unit tests for splitMarkdownByHeadings: the pre-splitter that cuts markdown
// into a preamble plus one chunk per line starting with "## ".
// NOTE: template-literal fixtures are intentionally flush-left; indenting them
// would change the strings under test.
describe("splitMarkdownByHeadings", () => {
  it("splits content by ## headings into preamble + per-section parts", () => {
    const file = {
      path: "test.txt",
      content: `# Docs

Intro text.

## Workers

Workers content.

## Pages

Pages content.`,
    };
    const result = splitMarkdownByHeadings(file);
    // preamble + 2 sections = 3 parts
    expect(result).toHaveLength(3);
    expect(result[0]?.content).toContain("Intro text.");
    expect(result[0]?.content).not.toContain("## Workers");
    expect(result[1]?.content).toContain("## Workers");
    expect(result[1]?.content).toContain("Workers content.");
    expect(result[2]?.content).toContain("## Pages");
    expect(result[2]?.content).toContain("Pages content.");
  });

  it("handles content starting with ## (no preamble)", () => {
    const file = {
      path: "test.txt",
      content: `## Alpha

Alpha content.

## Beta

Beta content.`,
    };
    const result = splitMarkdownByHeadings(file);
    // No preamble chunk is emitted when the first line is already a heading.
    expect(result).toHaveLength(2);
    expect(result[0]?.content).toMatch(/^## Alpha/);
    expect(result[0]?.content).toContain("Alpha content.");
    expect(result[1]?.content).toMatch(/^## Beta/);
    expect(result[1]?.content).toContain("Beta content.");
  });

  it("returns original file when no ## headings exist", () => {
    // "# " and "### " lines must not be mistaken for split points.
    const file = {
      path: "readme.md",
      content: "# Title\n\nJust a single section.\n\n### Subheading\n\nMore content.",
    };
    const result = splitMarkdownByHeadings(file);
    expect(result).toEqual([file]);
  });

  it("returns original file when content starts with a single ## section (no split needed)", () => {
    const file = {
      path: "single.txt",
      content: "## Only One Section\n\nContent here.",
    };
    const result = splitMarkdownByHeadings(file);
    expect(result).toEqual([file]);
  });

  it("preserves the preamble before the first ## heading", () => {
    // Front matter, the H1 and the intro all belong to the preamble chunk.
    const file = {
      path: "docs.md",
      content: `---
title: Docs
---

# Title

Intro paragraph.

## First Section

Content here.`,
    };
    const result = splitMarkdownByHeadings(file);
    expect(result).toHaveLength(2);
    expect(result[0]?.content).toContain("Intro paragraph.");
    expect(result[1]?.content).toContain("## First Section");
    expect(result[1]?.content).toContain("Content here.");
  });

  it("preserves empty lines within sections", () => {
    const file = {
      path: "spacing.txt",
      content: `## Section A

Line 1.

Line 2.

## Section B

Line 3.`,
    };
    const result = splitMarkdownByHeadings(file);
    expect(result).toHaveLength(2);
    expect(result[0]?.content).toContain("Line 1.\n\nLine 2.");
    expect(result[1]?.content).toContain("Line 3.");
  });

  it("preserves doc_path across all split parts", () => {
    const file = {
      path: "cloudflare.com/llms-full.txt",
      content: `## Workers\nContent.\n\n## Pages\nContent.`,
    };
    const result = splitMarkdownByHeadings(file);
    // Guard against a vacuous pass: the loop below asserts nothing if the
    // split unexpectedly returns an empty array.
    expect(result).toHaveLength(2);
    for (const part of result) {
      expect(part.path).toBe("cloudflare.com/llms-full.txt");
    }
  });
});
44 changes: 43 additions & 1 deletion packages/context/src/package-builder.ts
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,37 @@ export interface BuildResult {
totalTokens: number;
}

/**
 * Files larger than this are pre-split by ## headings before AST parsing to avoid OOM.
 *
 * buildPackage compares this against `file.content.length`, which counts
 * UTF-16 code units rather than bytes, so "1MB" is an approximation for
 * non-ASCII content.
 */
const MAX_FILE_SIZE_FOR_PARSING = 1024 * 1024; // 1MB

/**
 * Pre-split oversized markdown at `## ` headings into independently parseable chunks.
 *
 * Text before the first `## ` line becomes its own leading chunk (the
 * preamble); every following chunk starts with its `## ` heading line.
 * Lines are re-joined with "\n", so blank lines inside a section survive.
 *
 * Lines inside fenced code blocks (three or more backticks or tildes) are
 * never treated as headings. Without this, a code sample containing a line
 * that starts with `## ` would be cut in half and each half would be parsed
 * with an unbalanced fence. An unclosed fence runs to the end of the content,
 * so nothing after it is split.
 *
 * @param file - Markdown file to split; it is not mutated.
 * @returns The original `file` (same object) in a one-element array when
 *   fewer than two chunks result, otherwise one new file per chunk, each
 *   carrying the original `path`.
 */
export function splitMarkdownByHeadings(file: MarkdownFile): MarkdownFile[] {
  // Fast path: no line starts with "## " at all, so there is nothing to split.
  if (!/^## /m.test(file.content)) {
    return [file];
  }

  // Hoisted so the loop below does not rebuild them for every line.
  // Fence markers may be indented by at most three spaces.
  const fenceOpenPattern = /^ {0,3}(`{3,}|~{3,})/;
  // A closing fence carries no info string; \s also swallows a trailing "\r".
  const fenceClosePattern = /^ {0,3}(`{3,}|~{3,})\s*$/;

  const parts: string[] = [];
  let current: string[] = [];
  // Marker run (e.g. "```" or "~~~~") of the open fence, or null outside one.
  let openFence: string | null = null;

  for (const line of file.content.split("\n")) {
    if (openFence !== null) {
      // Only a fence of the same character and at least the same length closes it.
      const closeMarker = fenceClosePattern.exec(line)?.[1];
      if (
        closeMarker !== undefined &&
        closeMarker.charAt(0) === openFence.charAt(0) &&
        closeMarker.length >= openFence.length
      ) {
        openFence = null;
      }
      current.push(line);
      continue;
    }

    const openMatch = fenceOpenPattern.exec(line);
    const openMarker = openMatch?.[1];
    if (openMatch !== null && openMarker !== undefined) {
      const infoString = line.slice(openMatch[0].length);
      // A backtick fence whose info string contains a backtick is inline
      // code, not a fence opener.
      if (!(openMarker.charAt(0) === "`" && infoString.includes("`"))) {
        openFence = openMarker;
        current.push(line);
        continue;
      }
    }

    if (line.startsWith("## ")) {
      // Flush whatever precedes this heading (preamble or previous section).
      if (current.length > 0) {
        parts.push(current.join("\n"));
      }
      current = [line];
    } else {
      current.push(line);
    }
  }
  if (current.length > 0) {
    parts.push(current.join("\n"));
  }

  // A single chunk means no real split happened; hand back the input as-is.
  if (parts.length <= 1) return [file];

  return parts.map((content) => ({ path: file.path, content }));
}

/**
* Build a documentation package from markdown files.
*/
Expand Down Expand Up @@ -95,7 +126,18 @@ export function buildPackage(
const allSections: DocSection[] = [];
const seenHashes = new Set<string>();

for (const file of files) {
// Pre-split oversized files by ## headings to avoid OOM during AST parsing
const processedFiles = files.flatMap((file) => {
if (
file.content.length > MAX_FILE_SIZE_FOR_PARSING &&
/\.(md|mdx|txt)$/i.test(file.path)
) {
return splitMarkdownByHeadings(file);
}
return [file];
});

for (const file of processedFiles) {
try {
const parsed = parseDocument(file.content, file.path);
for (const section of parsed.sections) {
Expand Down
Loading