From ae3dc4c5e67fe93cc7fd2b94a123582a9864f882 Mon Sep 17 00:00:00 2001 From: Kevin David Date: Sun, 10 May 2026 23:59:41 -0400 Subject: [PATCH 1/4] fix: prevent silent content loss when splitting long messages MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit splitMessageImpl was discarding the tail of any single paragraph longer than maxLength via `para.slice(0, maxLength)` — that tail just vanished. Visible symptom: long fenced blocks (ASCII tables, etc.) ended mid-row with the rest of the response missing from Discord entirely. Rewrite splitMessageImpl to never drop content: - Paragraph fits in current chunk → append normally - Paragraph alone too long → flush current, split paragraph by lines - Line alone too long → hard-split by characters (last-resort fallback) Also add balanceCodeFences post-pass: when a chunk ends with an open ``` fence, close it there and re-open with the same language tag at the start of the next chunk. Without this, a fenced block split across multiple Discord messages renders as one fenced + one raw-text message. Discovered via instrumented debug logs that showed chunk sizes summing to less than the buffer length on every split (e.g. buffer=2706 but chunks=[63,1904], 739 chars lost). Co-Authored-By: Claude Opus 4.7 (1M context) --- src/formatting.ts | 91 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 18 deletions(-) diff --git a/src/formatting.ts b/src/formatting.ts index ec44023..340f5e9 100644 --- a/src/formatting.ts +++ b/src/formatting.ts @@ -507,33 +507,88 @@ function formatHighDetailsLegacy( function splitMessageImpl(text: string, maxLength: number): string[] { if (text.length <= maxLength) return [text]; - // Split at paragraph boundaries first, then lines + // Split at paragraph boundaries first; for paragraphs that are individually + // too long, fall back to line-splitting; for single lines that are too long, + // fall back to hard-character splitting. All content must reach a chunk — + // never truncate-and-drop. const paragraphs = text.split("\n\n"); const chunks: string[] = []; let current = ""; - for (const para of paragraphs) { - const candidate = current ? `${current}\n\n${para}` : para; - if (candidate.length > maxLength && current) { - chunks.push(current); - current = para.length > maxLength ? para.slice(0, maxLength) : para; - } else if (candidate.length > maxLength) { - // Single paragraph too long, split at line boundaries - const lines = para.split("\n"); - for (const line of lines) { - const lineCandidate = current ? `${current}\n${line}` : line; - if (lineCandidate.length > maxLength && current) { - chunks.push(current); - current = line; - } else { - current = lineCandidate; + const flush = () => { + if (current) { chunks.push(current); current = ""; } + }; + + const pushLines = (para: string) => { + const lines = para.split("\n"); + for (const line of lines) { + if (line.length > maxLength) { + // Single line too long — hard split by characters so nothing is lost. + flush(); + for (let i = 0; i < line.length; i += maxLength) { + chunks.push(line.slice(i, i + maxLength)); } + continue; + } + const lineCandidate = current ? `${current}\n${line}` : line; + if (lineCandidate.length > maxLength) { + flush(); + current = line; + } else { + current = lineCandidate; } + } + }; + + for (const para of paragraphs) { + if (para.length > maxLength) { + // Paragraph alone exceeds the limit — flush whatever we've buffered + // and split this paragraph by lines so the full content survives. + flush(); + pushLines(para); + continue; + } + const candidate = current ? `${current}\n\n${para}` : para; + if (candidate.length > maxLength) { + flush(); + current = para; } else { current = candidate; } } - if (current) chunks.push(current); - return chunks; + flush(); + return balanceCodeFences(chunks); +} + +/** + * Close any open ``` fence at the end of each chunk and re-open it (with the + * same language tag) at the start of the next. Without this, a long fenced + * block (e.g. an ASCII table) split across multiple Discord messages renders + * with the second message as raw text and the unclosed first message as code. + */ +function balanceCodeFences(chunks: string[]): string[] { + const result: string[] = []; + let pendingOpenFence: string | null = null; + + for (let chunk of chunks) { + if (pendingOpenFence) { + chunk = `${pendingOpenFence}\n${chunk}`; + pendingOpenFence = null; + } + + // Match ``` at start of a line, optionally followed by a language tag. + // Triple-backticks mid-line aren't valid fences; ignore them. + const fences = chunk.match(/^```[a-zA-Z0-9_-]*/gm) ?? []; + if (fences.length % 2 === 1) { + // Odd count → final fence in this chunk is opening, not closed. + // Close it here, re-open with the same tag at the next chunk. + pendingOpenFence = fences[fences.length - 1]; + chunk = `${chunk}\n\`\`\``; + } + + result.push(chunk); + } + + return result; } /** @deprecated Use renderToolCard instead */ From 68ca42b4f052c7818fe469b121775344d18e5492 Mon Sep 17 00:00:00 2001 From: Kevin David Date: Mon, 11 May 2026 00:37:16 -0400 Subject: [PATCH 2/4] test: add splitMessage regression test for content-loss bug + general coverage MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Includes a regression test that fails on the pre-fix implementation (verified by temporarily reverting the file — the test caught it), plus general coverage for: - single-chunk passthrough - paragraph-boundary splits - hard char-splitting when a single line exceeds maxLength - code-fence balancing across chunk boundaries - mixed-shape inputs (short + long + short paragraphs) Co-Authored-By: Claude Opus 4.7 (1M context) --- src/__tests__/formatting.test.ts | 112 +++++++++++++++++++++++++++++++ 1 file changed, 112 insertions(+) diff --git a/src/__tests__/formatting.test.ts b/src/__tests__/formatting.test.ts index d35e251..4fddd6b 100644 --- a/src/__tests__/formatting.test.ts +++ b/src/__tests__/formatting.test.ts @@ -4,6 +4,7 @@ import { renderToolCard, renderUsageEmbed, renderPermissionEmbed, + splitMessage, splitToolCardDescription, } from "../formatting.js"; import type { OutputMode, ToolDisplaySpec, ToolCardSnapshot } from "../formatting.js"; @@ -589,3 +590,114 @@ describe("renderPermissionEmbed", () => { expect((alwaysBtn as any).data.style).toBe(ButtonStyle.Secondary); }); }); + +// ─── splitMessage ─────────────────────────────────────────────────────────── + +describe("splitMessage", () => { + it("returns a single chunk when input fits in maxLength", () => { + const text = "short message"; + expect(splitMessage(text, 100)).toEqual([text]); + }); + + it("splits at paragraph boundaries when paragraphs fit individually", () => { + const a = "a".repeat(40); + const b = "b".repeat(40); + const c = "c".repeat(40); + const text = `${a}\n\n${b}\n\n${c}`; // 40+2+40+2+40 = 124 chars + const chunks = splitMessage(text, 90); + // Each chunk should contain whole paragraphs joined by \n\n where possible. + expect(chunks.length).toBeGreaterThanOrEqual(2); + // Reassembling concatenates with \n\n between chunks (each chunk is one or + // more whole paragraphs). All three source paragraphs survive. + const reassembled = chunks.join("\n\n"); + expect(reassembled).toContain(a); + expect(reassembled).toContain(b); + expect(reassembled).toContain(c); + // No chunk exceeds maxLength. + for (const chunk of chunks) expect(chunk.length).toBeLessThanOrEqual(90); + }); + + // Regression test for a content-loss bug fixed in this PR. The previous + // implementation handled this exact shape (a short paragraph followed by an + // over-long paragraph) with: + // if (candidate.length > maxLength && current) { + // chunks.push(current); + // current = para.length > maxLength ? para.slice(0, maxLength) : para; + // } + // The `slice(0, maxLength)` silently dropped everything past maxLength of + // the long paragraph. Discovered live with buffer=2706 producing + // chunks=[63, 1904] — 739 chars lost. + // + // The fix flushes `current`, then routes the over-long paragraph through + // line-splitting (and lines through char-splitting as a last resort). + it("preserves every character when a small paragraph precedes a long one", () => { + const intro = "intro"; + // 6 lines × ~190 chars = ~1140 chars, one paragraph, no internal \n\n. + const longLines = Array.from({ length: 6 }, (_, i) => `line ${i}: ${"x".repeat(180)}`); + const longPara = longLines.join("\n"); + const text = `${intro}\n\n${longPara}`; + expect(longPara.length).toBeGreaterThan(500); + + const chunks = splitMessage(text, 500); + // The intro must survive. + expect(chunks.some((c) => c.includes(intro))).toBe(true); + // Every line of the long paragraph must survive — the old code dropped + // chars from index 500 onward of `longPara`. + for (const line of longLines) { + expect(chunks.some((c) => c.includes(line))).toBe(true); + } + // No chunk exceeds maxLength. + for (const chunk of chunks) expect(chunk.length).toBeLessThanOrEqual(500); + }); + + it("hard-splits a single line that exceeds maxLength so nothing is lost", () => { + // One line, no newlines, way over the limit. + const text = "z".repeat(1500); + const chunks = splitMessage(text, 500); + expect(chunks.length).toBeGreaterThanOrEqual(3); + for (const chunk of chunks) expect(chunk.length).toBeLessThanOrEqual(500); + // Every byte still accounted for. + expect(chunks.join("").length).toBe(1500); + expect(chunks.join("")).toBe(text); + }); + + it("balances unclosed code fences across chunk boundaries", () => { + // A long fenced block that has to span multiple chunks. + const inside = Array.from({ length: 40 }, (_, i) => `row-${i}: ${"x".repeat(40)}`).join("\n"); + const text = "```text\n" + inside + "\n```"; + const chunks = splitMessage(text, 500); + expect(chunks.length).toBeGreaterThanOrEqual(2); + // First chunk opens a fence and must close it. + expect(chunks[0].startsWith("```text")).toBe(true); + expect(chunks[0].trimEnd().endsWith("```")).toBe(true); + // Subsequent chunks re-open with the same tag, and the last chunk + // contains the original closing fence. + for (let i = 1; i < chunks.length; i++) { + expect(chunks[i].startsWith("```text")).toBe(true); + } + expect(chunks[chunks.length - 1].trimEnd().endsWith("```")).toBe(true); + // No chunk has an unbalanced number of fence boundaries (each chunk + // should have an even number of ``` lines after balancing). + for (const chunk of chunks) { + const fenceCount = (chunk.match(/^```[a-zA-Z0-9_-]*/gm) ?? []).length; + expect(fenceCount % 2).toBe(0); + } + }); + + it("handles a mix of small paragraphs and one paragraph over the limit", () => { + const intro = "Here is some intro text."; + const longLines = Array.from({ length: 8 }, (_, i) => `data line ${i}: ${"y".repeat(80)}`); + const longPara = longLines.join("\n"); + const outro = "And a short outro."; + const text = `${intro}\n\n${longPara}\n\n${outro}`; + + const chunks = splitMessage(text, 400); + // No truncation of any kind. + for (const chunk of chunks) expect(chunk.length).toBeLessThanOrEqual(400); + const all = chunks.join("\n"); + expect(all).toContain(intro); + for (const line of longLines) expect(all).toContain(line); + expect(all).toContain(outro); + }); +}); + From 49d672cdf720b8e538a27cd39a1b701180f5150a Mon Sep 17 00:00:00 2001 From: Kevin David Date: Mon, 11 May 2026 08:54:48 -0400 Subject: [PATCH 3/4] =?UTF-8?q?fix:=20balanceCodeFences=20=E2=80=94=20trac?= =?UTF-8?q?k=20open=20fence=20by=20state,=20don't=20carry=20untagged=20clo?= =?UTF-8?q?ses?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses review feedback on #11. The previous balanceCodeFences treated fences as anonymous tokens — when fence count came out odd, it set `pendingOpenFence = fences[fences.length - 1]`. For a chunk whose fences are [open, close, close] (e.g. a balanced fenced block followed by a stray closing ```), the 'last fence' is a closing ``` with no language tag. That bare ``` then got prepended to the next chunk, silently stripping the language tag from every subsequent chunk. Walk fences in order and toggle the currently-open fence instead. At chunk end, whatever's open is what needs to be re-opened in the next chunk — and only if it's tagged. Naively trusting the last fence to be the opening one fails on [open, close, close]. Includes a regression test that exercises the [plain, balanced+orphan, tagged-block] arrangement and verifies the third chunk reopens with ```python rather than a bare ```. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/__tests__/formatting.test.ts | 29 +++++++++++++++++++++++++++++ src/formatting.ts | 32 ++++++++++++++++++++++++-------- 2 files changed, 53 insertions(+), 8 deletions(-) diff --git a/src/__tests__/formatting.test.ts b/src/__tests__/formatting.test.ts index 4fddd6b..7859faa 100644 --- a/src/__tests__/formatting.test.ts +++ b/src/__tests__/formatting.test.ts @@ -699,5 +699,34 @@ describe("splitMessage", () => { for (const line of longLines) expect(all).toContain(line); expect(all).toContain(outro); }); + + // Regression test for a code-fence balancing edge case raised in #11 review. + // balanceCodeFences treats fences as a sequence and (in the buggy version) + // sets pendingOpenFence to `fences[fences.length - 1]`. When a chunk has odd + // fence count but the LAST fence is an untagged closing ``` (e.g., a fence + // sequence of [open, close, close]), the next chunk gets an untagged ``` + // prepended instead of the original language tag. + // + // Trigger: input with a balanced fence followed by an extra ``` (malformed + // by intent, but LLMs do emit this), then a separate tagged fence. The + // middle chunk's fences = [python, close, close]; the bug propagates the + // last close as if it were an open, stripping the language tag from the + // next chunk's reopening. + it("propagates the LANGUAGE TAG (not bare ```) when an extra closing fence appears in a prior chunk", () => { + const text = [ + "This is plain text.", + "```python\nA\n```\nB\n```", // balanced block + orphan close + "```python\nmore code\n```", + ].join("\n\n"); + const chunks = splitMessage(text, 30); + // The chunk carrying "more code" must reopen with `\`\`\`python`, not + // a bare untagged `\`\`\`` that lost the language. (The Discord render + // for an untagged fence still works, but the bug would silently swallow + // the language tag across all subsequent chunks.) + const codeChunk = chunks.find((c) => c.includes("more code")); + expect(codeChunk).toBeDefined(); + const firstLine = codeChunk!.split("\n")[0]; + expect(firstLine).toBe("```python"); + }); }); diff --git a/src/formatting.ts b/src/formatting.ts index 340f5e9..b3cb174 100644 --- a/src/formatting.ts +++ b/src/formatting.ts @@ -564,25 +564,41 @@ function splitMessageImpl(text: string, maxLength: number): string[] { * same language tag) at the start of the next. Without this, a long fenced * block (e.g. an ASCII table) split across multiple Discord messages renders * with the second message as raw text and the unclosed first message as code. + * + * Implementation: walk the fences in order, tracking which fence (if any) is + * currently open. Whatever's open at the end of a chunk is what needs to be + * re-opened in the next chunk — and only if it's TAGGED. Naively grabbing the + * "last fence" misidentifies a closing ``` as an opening one when a chunk + * has [open, close, close] (e.g. a balanced block plus a stray closer). */ function balanceCodeFences(chunks: string[]): string[] { + // Triple-backtick at start of a line, optionally followed by a language tag. + const FENCE_RE = /^```[a-zA-Z0-9_-]*/gm; + // Only TAGGED opens are worth carrying forward — re-opening with an untagged + // ``` doesn't preserve any information the user couldn't infer. + const isTaggedOpen = (fence: string): boolean => fence.length > 3; + const result: string[] = []; let pendingOpenFence: string | null = null; for (let chunk of chunks) { - if (pendingOpenFence) { + if (pendingOpenFence !== null) { chunk = `${pendingOpenFence}\n${chunk}`; pendingOpenFence = null; } - // Match ``` at start of a line, optionally followed by a language tag. - // Triple-backticks mid-line aren't valid fences; ignore them. - const fences = chunk.match(/^```[a-zA-Z0-9_-]*/gm) ?? []; - if (fences.length % 2 === 1) { - // Odd count → final fence in this chunk is opening, not closed. - // Close it here, re-open with the same tag at the next chunk. - pendingOpenFence = fences[fences.length - 1]; + // Walk fences in order, toggling the currently-open fence. + let openFence: string | null = null; + for (const fence of chunk.match(FENCE_RE) ?? []) { + openFence = openFence === null ? fence : null; + } + + if (openFence !== null) { + // Close it here so the chunk renders as a self-contained block. chunk = `${chunk}\n\`\`\``; + // Carry forward only if tagged — an untagged carry-over is no + // improvement (and would corrupt the visual code-block style). + if (isTaggedOpen(openFence)) pendingOpenFence = openFence; } result.push(chunk); From 3e59b05e0c106a7462b59cb88ae3a248ff8ee531 Mon Sep 17 00:00:00 2001 From: Kevin David Date: Mon, 11 May 2026 12:07:39 -0400 Subject: [PATCH 4/4] fix: re-open untagged ``` fences when splitting a long fenced block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The prior balanceCodeFences fix dropped untagged carry-over entirely to avoid the "extra closer corrupts language tag" case. That suppressed a legitimate scenario: a long bare ```...``` block (e.g. an ASCII table with no language tag) split across chunks would leave the continuation chunks rendering as raw text. Refine the rule: an untagged open at the end of a chunk carries forward only if the chunk has content after the trailing fence — i.e., we genuinely split mid-block, not "balanced block followed by a dangling orphan closer". Tagged opens still always carry. Add a regression test covering the split-untagged-block path. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/__tests__/formatting.test.ts | 22 ++++++++++++++ src/formatting.ts | 52 ++++++++++++++++++++++---------- 2 files changed, 58 insertions(+), 16 deletions(-) diff --git a/src/__tests__/formatting.test.ts b/src/__tests__/formatting.test.ts index 7859faa..4c4d15e 100644 --- a/src/__tests__/formatting.test.ts +++ b/src/__tests__/formatting.test.ts @@ -728,5 +728,27 @@ describe("splitMessage", () => { const firstLine = codeChunk!.split("\n")[0]; expect(firstLine).toBe("```python"); }); + + // An untagged ``` fence (no language hint) wrapping a long block — split + // across chunks must reopen with ``` on the continuation chunk so the rest + // of the content stays inside the fenced block. Previously, untagged carry- + // over was suppressed (to avoid the "extra closer corrupts language tag" + // case above), which broke this legitimate scenario. + it("re-opens an untagged ``` fence across chunk boundaries", () => { + const rows = Array.from({ length: 30 }, (_, i) => `| col1-${i} | col2-${i} | col3-${i} |`); + const text = "```\n" + rows.join("\n") + "\n```"; + const chunks = splitMessage(text, 500); + expect(chunks.length).toBeGreaterThanOrEqual(2); + // Every continuation chunk must reopen with ``` so content stays fenced. + for (let i = 1; i < chunks.length; i++) { + const firstLine = chunks[i].split("\n")[0]; + expect(firstLine).toBe("```"); + } + // No chunk should leave a row unwrapped — the second chunk shouldn't + // start with the literal pipe content. + for (let i = 1; i < chunks.length; i++) { + expect(chunks[i].startsWith("|")).toBe(false); + } + }); }); diff --git a/src/formatting.ts b/src/formatting.ts index b3cb174..7b35886 100644 --- a/src/formatting.ts +++ b/src/formatting.ts @@ -560,23 +560,31 @@ function splitMessageImpl(text: string, maxLength: number): string[] { } /** - * Close any open ``` fence at the end of each chunk and re-open it (with the - * same language tag) at the start of the next. Without this, a long fenced - * block (e.g. an ASCII table) split across multiple Discord messages renders - * with the second message as raw text and the unclosed first message as code. + * Close any open ``` fence at the end of each chunk and re-open it at the + * start of the next (with the same language tag if any). Without this, a long + * fenced block split across Discord messages renders with the continuation + * as raw text. * - * Implementation: walk the fences in order, tracking which fence (if any) is - * currently open. Whatever's open at the end of a chunk is what needs to be - * re-opened in the next chunk — and only if it's TAGGED. Naively grabbing the - * "last fence" misidentifies a closing ``` as an opening one when a chunk - * has [open, close, close] (e.g. a balanced block plus a stray closer). + * Implementation: walk fences in order, tracking which fence (if any) is + * currently open via state machine. Whatever's open at the end of a chunk + * is what needs to be re-opened in the next chunk — with one nuance for + * untagged fences: + * + * - TAGGED open (e.g. ```python) at chunk end: always carry forward. + * Losing a language tag across a split is the worst-case outcome. + * + * - UNTAGGED open (bare ```) at chunk end: carry forward ONLY if the chunk + * has content after the trailing fence. A bare ``` as the last non-blank + * line is more likely an LLM emitting a dangling/orphan close (after a + * balanced block) than the start of a new code block. Carrying that + * untagged fence into the next chunk would corrupt any later language + * tag (e.g. prepending ``` in front of a ```python that should stay). */ function balanceCodeFences(chunks: string[]): string[] { // Triple-backtick at start of a line, optionally followed by a language tag. const FENCE_RE = /^```[a-zA-Z0-9_-]*/gm; - // Only TAGGED opens are worth carrying forward — re-opening with an untagged - // ``` doesn't preserve any information the user couldn't infer. - const isTaggedOpen = (fence: string): boolean => fence.length > 3; + // Anchored variant for "this whole line is a fence" check. + const FENCE_LINE_RE = /^```[a-zA-Z0-9_-]*$/; const result: string[] = []; let pendingOpenFence: string | null = null; @@ -594,11 +602,23 @@ function balanceCodeFences(chunks: string[]): string[] { } if (openFence !== null) { - // Close it here so the chunk renders as a self-contained block. + // Decide whether to carry forward BEFORE we mutate the chunk. + let shouldCarry: boolean; + if (openFence.length > 3) { + // Tagged open — always carry forward. + shouldCarry = true; + } else { + // Untagged open — only carry if there's content after the trailing + // fence (i.e., the chunk genuinely splits a fenced block mid-content). + const lines = chunk.split("\n"); + let i = lines.length - 1; + while (i >= 0 && lines[i].trim() === "") i--; + shouldCarry = i >= 0 && !FENCE_LINE_RE.test(lines[i]); + } + + // Close the chunk so it renders as a self-contained block either way. chunk = `${chunk}\n\`\`\``; - // Carry forward only if tagged — an untagged carry-over is no - // improvement (and would corrupt the visual code-block style). - if (isTaggedOpen(openFence)) pendingOpenFence = openFence; + if (shouldCarry) pendingOpenFence = openFence; } result.push(chunk);