diff --git a/src/__tests__/formatting.test.ts b/src/__tests__/formatting.test.ts index d35e251..4c4d15e 100644 --- a/src/__tests__/formatting.test.ts +++ b/src/__tests__/formatting.test.ts @@ -4,6 +4,7 @@ import { renderToolCard, renderUsageEmbed, renderPermissionEmbed, + splitMessage, splitToolCardDescription, } from "../formatting.js"; import type { OutputMode, ToolDisplaySpec, ToolCardSnapshot } from "../formatting.js"; @@ -589,3 +590,165 @@ describe("renderPermissionEmbed", () => { expect((alwaysBtn as any).data.style).toBe(ButtonStyle.Secondary); }); }); + +// ─── splitMessage ─────────────────────────────────────────────────────────── + +describe("splitMessage", () => { + it("returns a single chunk when input fits in maxLength", () => { + const text = "short message"; + expect(splitMessage(text, 100)).toEqual([text]); + }); + + it("splits at paragraph boundaries when paragraphs fit individually", () => { + const a = "a".repeat(40); + const b = "b".repeat(40); + const c = "c".repeat(40); + const text = `${a}\n\n${b}\n\n${c}`; // 40+2+40+2+40 = 124 chars + const chunks = splitMessage(text, 90); + // Each chunk should contain whole paragraphs joined by \n\n where possible. + expect(chunks.length).toBeGreaterThanOrEqual(2); + // Reassembling concatenates with \n\n between chunks (each chunk is one or + // more whole paragraphs). All three source paragraphs survive. + const reassembled = chunks.join("\n\n"); + expect(reassembled).toContain(a); + expect(reassembled).toContain(b); + expect(reassembled).toContain(c); + // No chunk exceeds maxLength. + for (const chunk of chunks) expect(chunk.length).toBeLessThanOrEqual(90); + }); + + // Regression test for a content-loss bug fixed in this PR. The previous + // implementation handled this exact shape (a short paragraph followed by an + // over-long paragraph) with: + // if (candidate.length > maxLength && current) { + // chunks.push(current); + // current = para.length > maxLength ? para.slice(0, maxLength) : para; + // } + // The `slice(0, maxLength)` silently dropped everything past maxLength of + // the long paragraph. Discovered live with buffer=2706 producing + // chunks=[63, 1904] — 739 chars lost. + // + // The fix flushes `current`, then routes the over-long paragraph through + // line-splitting (and lines through char-splitting as a last resort). + it("preserves every character when a small paragraph precedes a long one", () => { + const intro = "intro"; + // 6 lines × ~190 chars = ~1140 chars, one paragraph, no internal \n\n. + const longLines = Array.from({ length: 6 }, (_, i) => `line ${i}: ${"x".repeat(180)}`); + const longPara = longLines.join("\n"); + const text = `${intro}\n\n${longPara}`; + expect(longPara.length).toBeGreaterThan(500); + + const chunks = splitMessage(text, 500); + // The intro must survive. + expect(chunks.some((c) => c.includes(intro))).toBe(true); + // Every line of the long paragraph must survive — the old code dropped + // chars from index 500 onward of `longPara`. + for (const line of longLines) { + expect(chunks.some((c) => c.includes(line))).toBe(true); + } + // No chunk exceeds maxLength. + for (const chunk of chunks) expect(chunk.length).toBeLessThanOrEqual(500); + }); + + it("hard-splits a single line that exceeds maxLength so nothing is lost", () => { + // One line, no newlines, way over the limit. + const text = "z".repeat(1500); + const chunks = splitMessage(text, 500); + expect(chunks.length).toBeGreaterThanOrEqual(3); + for (const chunk of chunks) expect(chunk.length).toBeLessThanOrEqual(500); + // Every byte still accounted for. + expect(chunks.join("").length).toBe(1500); + expect(chunks.join("")).toBe(text); + }); + + it("balances unclosed code fences across chunk boundaries", () => { + // A long fenced block that has to span multiple chunks. + const inside = Array.from({ length: 40 }, (_, i) => `row-${i}: ${"x".repeat(40)}`).join("\n"); + const text = "```text\n" + inside + "\n```"; + const chunks = splitMessage(text, 500); + expect(chunks.length).toBeGreaterThanOrEqual(2); + // First chunk opens a fence and must close it. + expect(chunks[0].startsWith("```text")).toBe(true); + expect(chunks[0].trimEnd().endsWith("```")).toBe(true); + // Subsequent chunks re-open with the same tag, and the last chunk + // contains the original closing fence. + for (let i = 1; i < chunks.length; i++) { + expect(chunks[i].startsWith("```text")).toBe(true); + } + expect(chunks[chunks.length - 1].trimEnd().endsWith("```")).toBe(true); + // No chunk has an unbalanced number of fence boundaries (each chunk + // should have an even number of ``` lines after balancing). + for (const chunk of chunks) { + const fenceCount = (chunk.match(/^```[a-zA-Z0-9_-]*/gm) ?? []).length; + expect(fenceCount % 2).toBe(0); + } + }); + + it("handles a mix of small paragraphs and one paragraph over the limit", () => { + const intro = "Here is some intro text."; + const longLines = Array.from({ length: 8 }, (_, i) => `data line ${i}: ${"y".repeat(80)}`); + const longPara = longLines.join("\n"); + const outro = "And a short outro."; + const text = `${intro}\n\n${longPara}\n\n${outro}`; + + const chunks = splitMessage(text, 400); + // No truncation of any kind. + for (const chunk of chunks) expect(chunk.length).toBeLessThanOrEqual(400); + const all = chunks.join("\n"); + expect(all).toContain(intro); + for (const line of longLines) expect(all).toContain(line); + expect(all).toContain(outro); + }); + + // Regression test for a code-fence balancing edge case raised in #11 review. + // balanceCodeFences treats fences as a sequence and (in the buggy version) + // sets pendingOpenFence to `fences[fences.length - 1]`. When a chunk has odd + // fence count but the LAST fence is an untagged closing ``` (e.g., a fence + // sequence of [open, close, close]), the next chunk gets an untagged ``` + // prepended instead of the original language tag. + // + // Trigger: input with a balanced fence followed by an extra ``` (malformed + // by intent, but LLMs do emit this), then a separate tagged fence. The + // middle chunk's fences = [python, close, close]; the bug propagates the + // last close as if it were an open, stripping the language tag from the + // next chunk's reopening. + it("propagates the LANGUAGE TAG (not bare ```) when an extra closing fence appears in a prior chunk", () => { + const text = [ + "This is plain text.", + "```python\nA\n```\nB\n```", // balanced block + orphan close + "```python\nmore code\n```", + ].join("\n\n"); + const chunks = splitMessage(text, 30); + // The chunk carrying "more code" must reopen with `\`\`\`python`, not + // a bare untagged `\`\`\`` that lost the language. (The Discord render + // for an untagged fence still works, but the bug would silently swallow + // the language tag across all subsequent chunks.) + const codeChunk = chunks.find((c) => c.includes("more code")); + expect(codeChunk).toBeDefined(); + const firstLine = codeChunk!.split("\n")[0]; + expect(firstLine).toBe("```python"); + }); + + // An untagged ``` fence (no language hint) wrapping a long block — split + // across chunks must reopen with ``` on the continuation chunk so the rest + // of the content stays inside the fenced block. Previously, untagged carry- + // over was suppressed (to avoid the "extra closer corrupts language tag" + // case above), which broke this legitimate scenario. + it("re-opens an untagged ``` fence across chunk boundaries", () => { + const rows = Array.from({ length: 30 }, (_, i) => `| col1-${i} | col2-${i} | col3-${i} |`); + const text = "```\n" + rows.join("\n") + "\n```"; + const chunks = splitMessage(text, 500); + expect(chunks.length).toBeGreaterThanOrEqual(2); + // Every continuation chunk must reopen with ``` so content stays fenced. + for (let i = 1; i < chunks.length; i++) { + const firstLine = chunks[i].split("\n")[0]; + expect(firstLine).toBe("```"); + } + // No chunk should leave a row unwrapped — the second chunk shouldn't + // start with the literal pipe content. + for (let i = 1; i < chunks.length; i++) { + expect(chunks[i].startsWith("|")).toBe(false); + } + }); +}); + diff --git a/src/formatting.ts b/src/formatting.ts index ec44023..7b35886 100644 --- a/src/formatting.ts +++ b/src/formatting.ts @@ -507,33 +507,124 @@ function formatHighDetailsLegacy( function splitMessageImpl(text: string, maxLength: number): string[] { if (text.length <= maxLength) return [text]; - // Split at paragraph boundaries first, then lines + // Split at paragraph boundaries first; for paragraphs that are individually + // too long, fall back to line-splitting; for single lines that are too long, + // fall back to hard-character splitting. All content must reach a chunk — + // never truncate-and-drop. const paragraphs = text.split("\n\n"); const chunks: string[] = []; let current = ""; - for (const para of paragraphs) { - const candidate = current ? `${current}\n\n${para}` : para; - if (candidate.length > maxLength && current) { - chunks.push(current); - current = para.length > maxLength ? para.slice(0, maxLength) : para; - } else if (candidate.length > maxLength) { - // Single paragraph too long, split at line boundaries - const lines = para.split("\n"); - for (const line of lines) { - const lineCandidate = current ? `${current}\n${line}` : line; - if (lineCandidate.length > maxLength && current) { - chunks.push(current); - current = line; - } else { - current = lineCandidate; + const flush = () => { + if (current) { chunks.push(current); current = ""; } + }; + + const pushLines = (para: string) => { + const lines = para.split("\n"); + for (const line of lines) { + if (line.length > maxLength) { + // Single line too long — hard split by characters so nothing is lost. + flush(); + for (let i = 0; i < line.length; i += maxLength) { + chunks.push(line.slice(i, i + maxLength)); } + continue; + } + const lineCandidate = current ? `${current}\n${line}` : line; + if (lineCandidate.length > maxLength) { + flush(); + current = line; + } else { + current = lineCandidate; } + } + }; + + for (const para of paragraphs) { + if (para.length > maxLength) { + // Paragraph alone exceeds the limit — flush whatever we've buffered + // and split this paragraph by lines so the full content survives. + flush(); + pushLines(para); + continue; + } + const candidate = current ? `${current}\n\n${para}` : para; + if (candidate.length > maxLength) { + flush(); + current = para; } else { current = candidate; } } - if (current) chunks.push(current); - return chunks; + flush(); + return balanceCodeFences(chunks); +} + +/** + * Close any open ``` fence at the end of each chunk and re-open it at the + * start of the next (with the same language tag if any). Without this, a long + * fenced block split across Discord messages renders with the continuation + * as raw text. + * + * Implementation: walk fences in order, tracking which fence (if any) is + * currently open via state machine. Whatever's open at the end of a chunk + * is what needs to be re-opened in the next chunk — with one nuance for + * untagged fences: + * + * - TAGGED open (e.g. ```python) at chunk end: always carry forward. + * Losing a language tag across a split is the worst-case outcome. + * + * - UNTAGGED open (bare ```) at chunk end: carry forward ONLY if the chunk + * has content after the trailing fence. A bare ``` as the last non-blank + * line is more likely an LLM emitting a dangling/orphan close (after a + * balanced block) than the start of a new code block. Carrying that + * untagged fence into the next chunk would corrupt any later language + * tag (e.g. prepending ``` in front of a ```python that should stay). + */ +function balanceCodeFences(chunks: string[]): string[] { + // Triple-backtick at start of a line, optionally followed by a language tag. + const FENCE_RE = /^```[a-zA-Z0-9_-]*/gm; + // Anchored variant for "this whole line is a fence" check. + const FENCE_LINE_RE = /^```[a-zA-Z0-9_-]*$/; + + const result: string[] = []; + let pendingOpenFence: string | null = null; + + for (let chunk of chunks) { + if (pendingOpenFence !== null) { + chunk = `${pendingOpenFence}\n${chunk}`; + pendingOpenFence = null; + } + + // Walk fences in order, toggling the currently-open fence. + let openFence: string | null = null; + for (const fence of chunk.match(FENCE_RE) ?? []) { + openFence = openFence === null ? fence : null; + } + + if (openFence !== null) { + // Decide whether to carry forward BEFORE we mutate the chunk. + let shouldCarry: boolean; + if (openFence.length > 3) { + // Tagged open — always carry forward. + shouldCarry = true; + } else { + // Untagged open — only carry if there's content after the trailing + // fence (i.e., the chunk genuinely splits a fenced block mid-content). + const lines = chunk.split("\n"); + let i = lines.length - 1; + while (i >= 0 && lines[i].trim() === "") i--; + shouldCarry = i >= 0 && !FENCE_LINE_RE.test(lines[i]); + } + + // Close the chunk so it renders as a self-contained block either way. + chunk = `${chunk}\n\`\`\``; + if (shouldCarry) pendingOpenFence = openFence; + } + + result.push(chunk); + } + + return result; } /** @deprecated Use renderToolCard instead */