Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
163 changes: 163 additions & 0 deletions src/__tests__/formatting.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@ import {
renderToolCard,
renderUsageEmbed,
renderPermissionEmbed,
splitMessage,
splitToolCardDescription,
} from "../formatting.js";
import type { OutputMode, ToolDisplaySpec, ToolCardSnapshot } from "../formatting.js";
Expand Down Expand Up @@ -589,3 +590,165 @@ describe("renderPermissionEmbed", () => {
expect((alwaysBtn as any).data.style).toBe(ButtonStyle.Secondary);
});
});

// ─── splitMessage ───────────────────────────────────────────────────────────

describe("splitMessage", () => {
it("returns a single chunk when input fits in maxLength", () => {
const text = "short message";
expect(splitMessage(text, 100)).toEqual([text]);
});

it("splits at paragraph boundaries when paragraphs fit individually", () => {
const a = "a".repeat(40);
const b = "b".repeat(40);
const c = "c".repeat(40);
const text = `${a}\n\n${b}\n\n${c}`; // 40+2+40+2+40 = 124 chars
const chunks = splitMessage(text, 90);
// Each chunk should contain whole paragraphs joined by \n\n where possible.
expect(chunks.length).toBeGreaterThanOrEqual(2);
// Reassembling concatenates with \n\n between chunks (each chunk is one or
// more whole paragraphs). All three source paragraphs survive.
const reassembled = chunks.join("\n\n");
expect(reassembled).toContain(a);
expect(reassembled).toContain(b);
expect(reassembled).toContain(c);
// No chunk exceeds maxLength.
for (const chunk of chunks) expect(chunk.length).toBeLessThanOrEqual(90);
});

// Regression test for a content-loss bug fixed in this PR. The previous
// implementation handled this exact shape (a short paragraph followed by an
// over-long paragraph) with:
// if (candidate.length > maxLength && current) {
// chunks.push(current);
// current = para.length > maxLength ? para.slice(0, maxLength) : para;
// }
// The `slice(0, maxLength)` silently dropped everything past maxLength of
// the long paragraph. Discovered live with buffer=2706 producing
// chunks=[63, 1904] — 739 chars lost.
//
// The fix flushes `current`, then routes the over-long paragraph through
// line-splitting (and lines through char-splitting as a last resort).
it("preserves every character when a small paragraph precedes a long one", () => {
const intro = "intro";
// 6 lines × ~190 chars = ~1140 chars, one paragraph, no internal \n\n.
const longLines = Array.from({ length: 6 }, (_, i) => `line ${i}: ${"x".repeat(180)}`);
const longPara = longLines.join("\n");
const text = `${intro}\n\n${longPara}`;
expect(longPara.length).toBeGreaterThan(500);

const chunks = splitMessage(text, 500);
// The intro must survive.
expect(chunks.some((c) => c.includes(intro))).toBe(true);
// Every line of the long paragraph must survive — the old code dropped
// chars from index 500 onward of `longPara`.
for (const line of longLines) {
expect(chunks.some((c) => c.includes(line))).toBe(true);
}
// No chunk exceeds maxLength.
for (const chunk of chunks) expect(chunk.length).toBeLessThanOrEqual(500);
});

it("hard-splits a single line that exceeds maxLength so nothing is lost", () => {
// One line, no newlines, way over the limit.
const text = "z".repeat(1500);
const chunks = splitMessage(text, 500);
expect(chunks.length).toBeGreaterThanOrEqual(3);
for (const chunk of chunks) expect(chunk.length).toBeLessThanOrEqual(500);
// Every byte still accounted for.
expect(chunks.join("").length).toBe(1500);
expect(chunks.join("")).toBe(text);
});

it("balances unclosed code fences across chunk boundaries", () => {
// A long fenced block that has to span multiple chunks.
const inside = Array.from({ length: 40 }, (_, i) => `row-${i}: ${"x".repeat(40)}`).join("\n");
const text = "```text\n" + inside + "\n```";
const chunks = splitMessage(text, 500);
expect(chunks.length).toBeGreaterThanOrEqual(2);
// First chunk opens a fence and must close it.
expect(chunks[0].startsWith("```text")).toBe(true);
expect(chunks[0].trimEnd().endsWith("```")).toBe(true);
// Subsequent chunks re-open with the same tag, and the last chunk
// contains the original closing fence.
for (let i = 1; i < chunks.length; i++) {
expect(chunks[i].startsWith("```text")).toBe(true);
}
expect(chunks[chunks.length - 1].trimEnd().endsWith("```")).toBe(true);
// No chunk has an unbalanced number of fence boundaries (each chunk
// should have an even number of ``` lines after balancing).
for (const chunk of chunks) {
const fenceCount = (chunk.match(/^```[a-zA-Z0-9_-]*/gm) ?? []).length;
expect(fenceCount % 2).toBe(0);
}
});

it("handles a mix of small paragraphs and one paragraph over the limit", () => {
const intro = "Here is some intro text.";
const longLines = Array.from({ length: 8 }, (_, i) => `data line ${i}: ${"y".repeat(80)}`);
const longPara = longLines.join("\n");
const outro = "And a short outro.";
const text = `${intro}\n\n${longPara}\n\n${outro}`;

const chunks = splitMessage(text, 400);
// No truncation of any kind.
for (const chunk of chunks) expect(chunk.length).toBeLessThanOrEqual(400);
const all = chunks.join("\n");
expect(all).toContain(intro);
for (const line of longLines) expect(all).toContain(line);
expect(all).toContain(outro);
});

// Regression test for a code-fence balancing edge case raised in #11 review.
// balanceCodeFences treats fences as a sequence and (in the buggy version)
// sets pendingOpenFence to `fences[fences.length - 1]`. When a chunk has odd
// fence count but the LAST fence is an untagged closing ``` (e.g., a fence
// sequence of [open, close, close]), the next chunk gets an untagged ```
// prepended instead of the original language tag.
//
// Trigger: input with a balanced fence followed by an extra ``` (malformed
// by intent, but LLMs do emit this), then a separate tagged fence. The
// middle chunk's fences = [python, close, close]; the bug propagates the
// last close as if it were an open, stripping the language tag from the
// next chunk's reopening.
it("propagates the LANGUAGE TAG (not bare ```) when an extra closing fence appears in a prior chunk", () => {
const text = [
"This is plain text.",
"```python\nA\n```\nB\n```", // balanced block + orphan close
"```python\nmore code\n```",
].join("\n\n");
const chunks = splitMessage(text, 30);
// The chunk carrying "more code" must reopen with `\`\`\`python`, not
// a bare untagged `\`\`\`` that lost the language. (The Discord render
// for an untagged fence still works, but the bug would silently swallow
// the language tag across all subsequent chunks.)
const codeChunk = chunks.find((c) => c.includes("more code"));
expect(codeChunk).toBeDefined();
const firstLine = codeChunk!.split("\n")[0];
expect(firstLine).toBe("```python");
});

// An untagged ``` fence (no language hint) wrapping a long block — split
// across chunks must reopen with ``` on the continuation chunk so the rest
// of the content stays inside the fenced block. Previously, untagged carry-
// over was suppressed (to avoid the "extra closer corrupts language tag"
// case above), which broke this legitimate scenario.
it("re-opens an untagged ``` fence across chunk boundaries", () => {
const rows = Array.from({ length: 30 }, (_, i) => `| col1-${i} | col2-${i} | col3-${i} |`);
const text = "```\n" + rows.join("\n") + "\n```";
const chunks = splitMessage(text, 500);
expect(chunks.length).toBeGreaterThanOrEqual(2);
// Every continuation chunk must reopen with ``` so content stays fenced.
for (let i = 1; i < chunks.length; i++) {
const firstLine = chunks[i].split("\n")[0];
expect(firstLine).toBe("```");
}
// No chunk should leave a row unwrapped — the second chunk shouldn't
// start with the literal pipe content.
for (let i = 1; i < chunks.length; i++) {
expect(chunks[i].startsWith("|")).toBe(false);
}
});
});

127 changes: 109 additions & 18 deletions src/formatting.ts
Original file line number Diff line number Diff line change
Expand Up @@ -507,33 +507,124 @@ function formatHighDetailsLegacy(

function splitMessageImpl(text: string, maxLength: number): string[] {
Comment thread
kevin-david marked this conversation as resolved.
if (text.length <= maxLength) return [text];
// Split at paragraph boundaries first, then lines
// Split at paragraph boundaries first; for paragraphs that are individually
// too long, fall back to line-splitting; for single lines that are too long,
// fall back to hard-character splitting. All content must reach a chunk —
// never truncate-and-drop.
const paragraphs = text.split("\n\n");
const chunks: string[] = [];
let current = "";
for (const para of paragraphs) {
const candidate = current ? `${current}\n\n${para}` : para;
if (candidate.length > maxLength && current) {
chunks.push(current);
current = para.length > maxLength ? para.slice(0, maxLength) : para;
} else if (candidate.length > maxLength) {
// Single paragraph too long, split at line boundaries
const lines = para.split("\n");
for (const line of lines) {
const lineCandidate = current ? `${current}\n${line}` : line;
if (lineCandidate.length > maxLength && current) {
chunks.push(current);
current = line;
} else {
current = lineCandidate;
const flush = () => {
if (current) { chunks.push(current); current = ""; }
};

const pushLines = (para: string) => {
const lines = para.split("\n");
for (const line of lines) {
if (line.length > maxLength) {
// Single line too long — hard split by characters so nothing is lost.
flush();
for (let i = 0; i < line.length; i += maxLength) {
chunks.push(line.slice(i, i + maxLength));
}
continue;
}
const lineCandidate = current ? `${current}\n${line}` : line;
if (lineCandidate.length > maxLength) {
flush();
current = line;
} else {
current = lineCandidate;
}
}
};

for (const para of paragraphs) {
if (para.length > maxLength) {
// Paragraph alone exceeds the limit — flush whatever we've buffered
// and split this paragraph by lines so the full content survives.
flush();
pushLines(para);
continue;
}
const candidate = current ? `${current}\n\n${para}` : para;
if (candidate.length > maxLength) {
flush();
current = para;
} else {
current = candidate;
}
}
if (current) chunks.push(current);
return chunks;
flush();
return balanceCodeFences(chunks);
}

/**
* Close any open ``` fence at the end of each chunk and re-open it at the
* start of the next (with the same language tag if any). Without this, a long
* fenced block split across Discord messages renders with the continuation
* as raw text.
*
* Implementation: walk fences in order, tracking which fence (if any) is
* currently open via state machine. Whatever's open at the end of a chunk
* is what needs to be re-opened in the next chunk — with one nuance for
* untagged fences:
*
* - TAGGED open (e.g. ```python) at chunk end: always carry forward.
* Losing a language tag across a split is the worst-case outcome.
*
* - UNTAGGED open (bare ```) at chunk end: carry forward ONLY if the chunk
* has content after the trailing fence. A bare ``` as the last non-blank
* line is more likely an LLM emitting a dangling/orphan close (after a
* balanced block) than the start of a new code block. Carrying that
* untagged fence into the next chunk would corrupt any later language
* tag (e.g. prepending ``` in front of a ```python that should stay).
*/
function balanceCodeFences(chunks: string[]): string[] {
// Triple-backtick at start of a line, optionally followed by a language tag.
const FENCE_RE = /^```[a-zA-Z0-9_-]*/gm;
// Anchored variant for "this whole line is a fence" check.
const FENCE_LINE_RE = /^```[a-zA-Z0-9_-]*$/;

const result: string[] = [];
let pendingOpenFence: string | null = null;

for (let chunk of chunks) {
if (pendingOpenFence !== null) {
chunk = `${pendingOpenFence}\n${chunk}`;
pendingOpenFence = null;
}

// Walk fences in order, toggling the currently-open fence.
let openFence: string | null = null;
for (const fence of chunk.match(FENCE_RE) ?? []) {
openFence = openFence === null ? fence : null;
}

if (openFence !== null) {
// Decide whether to carry forward BEFORE we mutate the chunk.
let shouldCarry: boolean;
if (openFence.length > 3) {
// Tagged open — always carry forward.
shouldCarry = true;
} else {
// Untagged open — only carry if there's content after the trailing
// fence (i.e., the chunk genuinely splits a fenced block mid-content).
const lines = chunk.split("\n");
let i = lines.length - 1;
while (i >= 0 && lines[i].trim() === "") i--;
shouldCarry = i >= 0 && !FENCE_LINE_RE.test(lines[i]);
}

// Close the chunk so it renders as a self-contained block either way.
chunk = `${chunk}\n\`\`\``;
if (shouldCarry) pendingOpenFence = openFence;
}

result.push(chunk);
}

return result;
}

/** @deprecated Use renderToolCard instead */
Expand Down