From e03a0b572be426522b2dfabc550b49438eea2ac3 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Sun, 15 Feb 2026 13:50:25 -0500 Subject: [PATCH 01/15] implement 8-bit --- graphemes/ansi.go | 76 ++++++++++++++++----------------- graphemes/ansi_test.go | 95 ++++++++++++++++++++---------------------- graphemes/iterator.go | 17 ++++---- 3 files changed, 91 insertions(+), 97 deletions(-) diff --git a/graphemes/ansi.go b/graphemes/ansi.go index 3a038db..8aa9b96 100644 --- a/graphemes/ansi.go +++ b/graphemes/ansi.go @@ -3,23 +3,27 @@ package graphemes // ansiEscapeLength returns the byte length of a valid ANSI escape/control // sequence at the start of data, or 0 if none. // -// Input is UTF-8. This recognizes both: +// This recognizes both: // - 7-bit representations (ESC + final/intermediate bytes), and -// - UTF-8 encodings of 8-bit C1 controls (U+0080..U+009F => 0xC2 0x80..0x9F). +// - 8-bit C1 controls (raw bytes 0x80..0x9F per ECMA-48). // // Recognized forms (ECMA-48 / ISO 6429): -// - CSI: ESC [ then parameter bytes (0x30–0x3F), intermediate (0x20–0x2F), final (0x40–0x7E) -// - OSC: ESC ] then payload until ST (ESC \) or BEL (0x07) -// - DCS, SOS, PM, APC: ESC P / X / ^ / _ then payload until ST (ESC \) +// - CSI: ESC [ (or 0x9B) then parameter bytes (0x30–0x3F), intermediate (0x20–0x2F), final (0x40–0x7E) +// - OSC: ESC ] (or 0x9D) then payload until ST, BEL (0x07), CAN (0x18), or SUB (0x1A) +// - DCS, SOS, PM, APC: ESC P/X/^/_ (or 0x90/0x98/0x9E/0x9F) then payload until ST, CAN, or SUB // - Two-byte: ESC + Fe/Fs (0x40–0x7E excluding above), or Fp (0x30–0x3F), or nF (0x20–0x2F then final) +// - Standalone C1 controls (0x80..0x9F not listed above): single byte func ansiEscapeLength[T ~string | ~[]byte](data T) int { n := len(data) - if n < 2 { + if n == 0 { return 0 } switch data[0] { case esc: + if n < 2 { + return 0 + } b1 := data[1] switch b1 { case '[': // CSI @@ -61,34 +65,31 @@ func ansiEscapeLength[T ~string | ~[]byte](data T) int { return 0 } - case 
c1UTF8Lead: - b1 := data[1] - if b1 < 0x80 || b1 > 0x9F { + case 0x9B: // C1 CSI + body := csiLength(data[1:]) + if body == 0 { return 0 } + return 1 + body - switch b1 { - case 0x9B: // CSI - body := csiLength(data[2:]) - if body == 0 { - return 0 - } - return 2 + body - case 0x9D: // OSC – allows BEL or ST as terminator - body := oscLength(data[2:]) - if body < 0 { - return 0 - } - return 2 + body - case 0x90, 0x98, 0x9E, 0x9F: // DCS, SOS, PM, APC – require ST only - body := stSequenceLength(data[2:]) - if body < 0 { - return 0 - } - return 2 + body - default: - // Any other C1 control (UTF-8 encoded) is one control sequence token. - return 2 + case 0x9D: // C1 OSC + body := oscLength(data[1:]) + if body < 0 { + return 0 + } + return 1 + body + + case 0x90, 0x98, 0x9E, 0x9F: // C1 DCS, SOS, PM, APC + body := stSequenceLength(data[1:]) + if body < 0 { + return 0 + } + return 1 + body + + default: + if data[0] >= 0x80 && data[0] <= 0x9F { + // Any other C1 control is a single-byte sequence. + return 1 } } @@ -132,12 +133,13 @@ func csiLength[T ~string | ~[]byte](data T) int { // - -1: not terminated in the provided data // // OSC accepts BEL (0x07) or ST as terminator by widespread convention. +// ST may be 7-bit (ESC \) or C1 (0x9C). // Per ECMA-48, CAN (0x18) and SUB (0x1A) cancel the control string; in that // case they are not part of the OSC sequence length. func oscLength[T ~string | ~[]byte](data T) int { for i := 0; i < len(data); i++ { b := data[i] - if b == bel { + if b == bel || b == st { return i + 1 } if b == can || b == sub { @@ -146,9 +148,6 @@ func oscLength[T ~string | ~[]byte](data T) int { if b == esc && i+1 < len(data) && data[i+1] == '\\' { return i + 2 } - if b == c1UTF8Lead && i+1 < len(data) && data[i+1] == 0x9C { - return i + 2 - } } return -1 } @@ -161,6 +160,7 @@ func oscLength[T ~string | ~[]byte](data T) int { // - -1: not terminated in the provided data // // Used for DCS, SOS, PM, and APC, which per ECMA-48 terminate with ST. 
+// ST may be 7-bit (ESC \) or C1 (0x9C). // CAN (0x18) and SUB (0x1A) cancel the control string; in that case they are // not part of the sequence length. func stSequenceLength[T ~string | ~[]byte](data T) int { @@ -168,10 +168,10 @@ func stSequenceLength[T ~string | ~[]byte](data T) int { if data[i] == can || data[i] == sub { return i } - if data[i] == esc && i+1 < len(data) && data[i+1] == '\\' { - return i + 2 + if data[i] == st { + return i + 1 } - if data[i] == c1UTF8Lead && i+1 < len(data) && data[i+1] == 0x9C { + if data[i] == esc && i+1 < len(data) && data[i+1] == '\\' { return i + 2 } } diff --git a/graphemes/ansi_test.go b/graphemes/ansi_test.go index 32b37f1..6d7b434 100644 --- a/graphemes/ansi_test.go +++ b/graphemes/ansi_test.go @@ -158,54 +158,49 @@ func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { expected: []string{"\x1b[m", "x"}, }, { - name: "UTF-8 C1 CSI then text", - input: "\xC2\x9B31mhello", - expected: []string{"\xC2\x9B31m", "h", "e", "l", "l", "o"}, + name: "C1 CSI then text", + input: "\x9B31mhello", + expected: []string{"\x9B31m", "h", "e", "l", "l", "o"}, }, { - name: "UTF-8 C1 OSC with UTF-8 C1 ST terminator", - input: "\xC2\x9D0;Title\xC2\x9C", - expected: []string{"\xC2\x9D0;Title\xC2\x9C"}, + name: "C1 OSC with C1 ST terminator", + input: "\x9D0;Title\x9C", + expected: []string{"\x9D0;Title\x9C"}, }, { - name: "UTF-8 C1 OSC with 7-bit ST terminator", - input: "\xC2\x9D0;Title\x1b\\", - expected: []string{"\xC2\x9D0;Title\x1b\\"}, + name: "C1 OSC with 7-bit ST terminator", + input: "\x9D0;Title\x1b\\", + expected: []string{"\x9D0;Title\x1b\\"}, }, { - name: "7-bit OSC with UTF-8 C1 ST terminator", - input: "\x1b]0;Title\xC2\x9C", - expected: []string{"\x1b]0;Title\xC2\x9C"}, + name: "7-bit OSC with C1 ST terminator", + input: "\x1b]0;Title\x9C", + expected: []string{"\x1b]0;Title\x9C"}, }, { - name: "UTF-8 C1 DCS with UTF-8 C1 ST terminator", - input: "\xC2\x90qpayload\xC2\x9C", - expected: []string{"\xC2\x90qpayload\xC2\x9C"}, 
+ name: "C1 DCS with C1 ST terminator", + input: "\x90qpayload\x9C", + expected: []string{"\x90qpayload\x9C"}, }, { - name: "UTF-8 C1 DCS canceled by CAN", - input: "\xC2\x90qpayload\x18x", - expected: []string{"\xC2\x90qpayload", "\x18", "x"}, + name: "C1 DCS canceled by CAN", + input: "\x90qpayload\x18x", + expected: []string{"\x90qpayload", "\x18", "x"}, }, { - name: "UTF-8 C1 DCS with 7-bit ST terminator", - input: "\xC2\x90qpayload\x1b\\", - expected: []string{"\xC2\x90qpayload\x1b\\"}, + name: "C1 DCS with 7-bit ST terminator", + input: "\x90qpayload\x1b\\", + expected: []string{"\x90qpayload\x1b\\"}, }, { - name: "7-bit DCS with UTF-8 C1 ST terminator", - input: "\x1bPqpayload\xC2\x9C", - expected: []string{"\x1bPqpayload\xC2\x9C"}, + name: "7-bit DCS with C1 ST terminator", + input: "\x1bPqpayload\x9C", + expected: []string{"\x1bPqpayload\x9C"}, }, { - name: "UTF-8 C1 Fe IND control", - input: "\xC2\x84", - expected: []string{"\xC2\x84"}, - }, - { - name: "UTF-8 C1 lead byte for non-C1 codepoint is not ANSI", - input: "\u00A9", - expected: []string{"\u00A9"}, + name: "C1 Fe IND control", + input: "\x84", + expected: []string{"\x84"}, }, { name: "nF malformed: no final byte", @@ -228,29 +223,29 @@ func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { expected: []string{"\x1b[0 q"}, }, { - name: "UTF-8 C1 OSC unterminated", - input: "\xC2\x9D0;title", - expected: []string{"\xC2\x9D", "0", ";", "t", "i", "t", "l", "e"}, + name: "C1 OSC unterminated", + input: "\x9D0;title", + expected: []string{"\x9D", "0", ";", "t", "i", "t", "l", "e"}, }, { - name: "UTF-8 C1 DCS unterminated", - input: "\xC2\x90data", - expected: []string{"\xC2\x90", "d", "a", "t", "a"}, + name: "C1 DCS unterminated", + input: "\x90data", + expected: []string{"\x90", "d", "a", "t", "a"}, }, { - name: "UTF-8 C1 SOS with UTF-8 C1 ST terminator", - input: "\xC2\x98hello\xC2\x9C", - expected: []string{"\xC2\x98hello\xC2\x9C"}, + name: "C1 SOS with C1 ST terminator", + input: "\x98hello\x9C", + 
expected: []string{"\x98hello\x9C"}, }, { - name: "UTF-8 C1 PM with 7-bit ST terminator", - input: "\xC2\x9Emsg\x1b\\", - expected: []string{"\xC2\x9Emsg\x1b\\"}, + name: "C1 PM with 7-bit ST terminator", + input: "\x9Emsg\x1b\\", + expected: []string{"\x9Emsg\x1b\\"}, }, { - name: "UTF-8 C1 APC with UTF-8 C1 ST terminator", - input: "\xC2\x9Fdata\xC2\x9C", - expected: []string{"\xC2\x9Fdata\xC2\x9C"}, + name: "C1 APC with C1 ST terminator", + input: "\x9Fdata\x9C", + expected: []string{"\x9Fdata\x9C"}, }, { name: "single ESC byte", @@ -258,9 +253,9 @@ func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { expected: []string{"\x1b"}, }, { - name: "single C1 lead byte (incomplete UTF-8)", - input: "\xC2", - expected: []string{"\xC2"}, + name: "single C1 control byte", + input: "\x84", + expected: []string{"\x84"}, }, { name: "SOS canceled by CAN", diff --git a/graphemes/iterator.go b/graphemes/iterator.go index a734657..8494296 100644 --- a/graphemes/iterator.go +++ b/graphemes/iterator.go @@ -38,12 +38,12 @@ var ( ) const ( - esc = 0x1B - cr = 0x0D - bel = 0x07 - can = 0x18 - sub = 0x1A - c1UTF8Lead = 0xC2 + esc = 0x1B + cr = 0x0D + bel = 0x07 + can = 0x18 + sub = 0x1A + st = 0x9C // C1 String Terminator ) // Next advances the iterator to the next grapheme cluster. @@ -54,7 +54,8 @@ func (iter *Iterator[T]) Next() bool { } iter.start = iter.pos - if iter.AnsiEscapeSequences && (iter.data[iter.pos] == esc || iter.data[iter.pos] == c1UTF8Lead) { + b := iter.data[iter.pos] + if iter.AnsiEscapeSequences && (b == esc || (b >= 0x80 && b <= 0x9F)) { if a := ansiEscapeLength(iter.data[iter.pos:]); a > 0 { iter.pos += a return true @@ -62,8 +63,6 @@ func (iter *Iterator[T]) Next() bool { } // ASCII hot path: any ASCII is one grapheme when next byte is ASCII or end. - // Fall through on CR so splitfunc can handle CR+LF as a single cluster. 
- b := iter.data[iter.pos] if b < utf8.RuneSelf && b != cr { if iter.pos+1 >= len(iter.data) || iter.data[iter.pos+1] < utf8.RuneSelf { iter.pos++ From b8f2c25fff652fe6124b54da05d0f99c888da478 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Sun, 15 Feb 2026 13:59:34 -0500 Subject: [PATCH 02/15] add comparative tests --- graphemes/comparative/comparative_test.go | 125 ++++++++++++++++++++++ graphemes/comparative/go.mod | 10 +- graphemes/comparative/go.sum | 10 ++ 3 files changed, 144 insertions(+), 1 deletion(-) diff --git a/graphemes/comparative/comparative_test.go b/graphemes/comparative/comparative_test.go index fa5ac61..9420081 100644 --- a/graphemes/comparative/comparative_test.go +++ b/graphemes/comparative/comparative_test.go @@ -1,9 +1,11 @@ package comparative import ( + "reflect" "strings" "testing" + "github.com/charmbracelet/x/ansi" "github.com/clipperhouse/uax29/v2/graphemes" "github.com/clipperhouse/uax29/v2/testdata" "github.com/rivo/uniseg" @@ -75,3 +77,126 @@ func BenchmarkGraphemesASCII(b *testing.B) { } }) } + +// TestAnsiBoundaryAgreement verifies that our ANSI sequence parsing produces +// the same token boundaries as charmbracelet/x/ansi's DecodeSequence. +// Inputs use ASCII text between sequences so grapheme clustering differences +// don't obscure ANSI boundary comparison. 
+func TestAnsiBoundaryAgreement(t *testing.T) { + tests := []struct { + name string + input string + }{ + // 7-bit CSI + {"SGR reset", "\x1b[0m"}, + {"SGR color then text then reset", "\x1b[31mhello\x1b[0m"}, + {"CSI bold+color", "\x1b[1;32m"}, + {"CSI cursor position", "\x1b[10;20H"}, + + // 7-bit OSC + {"OSC title with BEL", "\x1b]0;My Title\x07"}, + {"OSC title with ST", "\x1b]0;Title\x1b\\"}, + + // 7-bit DCS/SOS/PM/APC + {"DCS with ST", "\x1bPq#0;2;0;0;0\x1b\\"}, + {"SOS with ST", "\x1bXhello\x1b\\"}, + {"PM with ST", "\x1b^msg\x1b\\"}, + {"APC with ST", "\x1b_data\x1b\\"}, + + // Two-byte Fe/Fs/Fp + {"Fe IND", "\x1bD"}, + {"Fs RIS", "\x1bc"}, + {"Fp DECSC", "\x1b7"}, + + // C1 8-bit + {"C1 CSI then text", "\x9B31mhello"}, + {"C1 OSC with C1 ST", "\x9D0;Title\x9C"}, + {"C1 OSC with 7-bit ST", "\x9D0;Title\x1b\\"}, + {"C1 DCS with 7-bit ST", "\x90qpayload\x1b\\"}, + {"C1 DCS with C1 ST", "\x90qpayload\x9C"}, + {"C1 SOS with C1 ST", "\x98hello\x9C"}, + {"C1 PM with 7-bit ST", "\x9Emsg\x1b\\"}, + {"C1 APC with C1 ST", "\x9Fdata\x9C"}, + + // CSI variants (from charmbracelet test suite) + {"CSI private mode", "\x1b[?1049h"}, + {"CSI subparams (colons)", "\x1b[38:2:255:0:255;1m"}, + {"CSI with intermediate", "\x1b[0 q"}, + {"CSI no params", "\x1b[m"}, + {"CSI mouse click", "\x1b[<0;1;1M"}, + {"CSI mouse wheel", "\x1b[<64;2;11m"}, + {"CSI bracketed paste on", "\x1b[?2004h"}, + {"CSI bracketed paste content", "\x1b[200~pasted text\x1b[201~"}, + + // SS3 / SS2 (Single Shift) + {"SS3 7-bit", "\x1bOA"}, + {"SS3 8-bit", "\x8fA"}, + {"SS2 7-bit", "\x1bNA"}, + {"SS2 8-bit", "\x8eA"}, + + // nF sequences + {"nF charset G0", "\x1b(A"}, + {"nF charset G0 then text", "\x1b(Btext"}, + + // DCS with params + {"DCS with params and C1 ST", "\x1bP0;1|17/ab\x9c"}, + {"C1 DCS with params and C1 ST", "\x90?123;456+q\x9c"}, + + // APC payload (Kitty graphics protocol) + {"APC kitty graphics", "\x1b_Gf=24,s=10,v=20,o=z;aGVsbG8gd29ybGQ=\x1b\\"}, + + // C1 CSI with multiple params + {"C1 
CSI multiple params", "\x9B1;2;3m"}, + + // Mixed 7-bit and C1 + {"mixed 7-bit and C1", "\x1b[1m\x9B31mhello\x1b[0m"}, + + // Concatenated sequences + {"concatenated CSI+OSC", "\x1b[1;2;3m\x1b]2;Terminal\x07"}, + {"OSC then CSI", "\x1b]0;Title\x07\x1b[31mred\x1b[0m"}, + + // Text around sequences + {"text around SGR", "hello, \x1b[1;2;3mworld\x1b[0m!"}, + + // Realistic colored output + {"colored ls", "\x1b[1;34mDocuments\x1b[0m \x1b[0;32mbuild.sh\x1b[0m"}, + + // Plain text (no ANSI) + {"plain ASCII", "hello world"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ours := uax29Tokens(tt.input) + theirs := charmTokens(tt.input) + if !reflect.DeepEqual(ours, theirs) { + t.Errorf("boundary mismatch\nours: %q\ntheirs: %q", ours, theirs) + } + }) + } +} + +// uax29Tokens segments the input using our graphemes iterator with ANSI support. +func uax29Tokens(input string) []string { + iter := graphemes.FromString(input) + iter.AnsiEscapeSequences = true + var tokens []string + for iter.Next() { + tokens = append(tokens, iter.Value()) + } + return tokens +} + +// charmTokens segments the input using charmbracelet/x/ansi's DecodeSequence. 
+func charmTokens(input string) []string { + var state byte + remaining := input + var tokens []string + for len(remaining) > 0 { + seq, _, n, newState := ansi.DecodeSequence(remaining, state, nil) + tokens = append(tokens, seq) + state = newState + remaining = remaining[n:] + } + return tokens +} diff --git a/graphemes/comparative/go.mod b/graphemes/comparative/go.mod index 570fcd1..24fdeec 100644 --- a/graphemes/comparative/go.mod +++ b/graphemes/comparative/go.mod @@ -1,10 +1,18 @@ module github.com/clipperhouse/uax29/graphemes/comparative -go 1.18 +go 1.24.2 require ( github.com/clipperhouse/uax29/v2 v2.6.0 github.com/rivo/uniseg v0.4.7 ) +require ( + github.com/charmbracelet/x/ansi v0.11.6 // indirect + github.com/clipperhouse/displaywidth v0.9.0 // indirect + github.com/clipperhouse/stringish v0.1.1 // indirect + github.com/lucasb-eyer/go-colorful v1.3.0 // indirect + github.com/mattn/go-runewidth v0.0.19 // indirect +) + replace github.com/clipperhouse/uax29/v2 => ../../ diff --git a/graphemes/comparative/go.sum b/graphemes/comparative/go.sum index 9008848..df31c87 100644 --- a/graphemes/comparative/go.sum +++ b/graphemes/comparative/go.sum @@ -1,2 +1,12 @@ +github.com/charmbracelet/x/ansi v0.11.6 h1:GhV21SiDz/45W9AnV2R61xZMRri5NlLnl6CVF7ihZW8= +github.com/charmbracelet/x/ansi v0.11.6/go.mod h1:2JNYLgQUsyqaiLovhU2Rv/pb8r6ydXKS3NIttu3VGZQ= +github.com/clipperhouse/displaywidth v0.9.0 h1:Qb4KOhYwRiN3viMv1v/3cTBlz3AcAZX3+y9OLhMtAtA= +github.com/clipperhouse/displaywidth v0.9.0/go.mod h1:aCAAqTlh4GIVkhQnJpbL0T/WfcrJXHcj8C0yjYcjOZA= +github.com/clipperhouse/stringish v0.1.1 h1:+NSqMOr3GR6k1FdRhhnXrLfztGzuG+VuFDfatpWHKCs= +github.com/clipperhouse/stringish v0.1.1/go.mod h1:v/WhFtE1q0ovMta2+m+UbpZ+2/HEXNWYXQgCt4hdOzA= +github.com/lucasb-eyer/go-colorful v1.3.0 h1:2/yBRLdWBZKrf7gB40FoiKfAWYQ0lqNcbuQwVHXptag= +github.com/lucasb-eyer/go-colorful v1.3.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= +github.com/mattn/go-runewidth v0.0.19 
h1:v++JhqYnZuu5jSKrk9RbgF5v4CGUjqRfBm05byFGLdw= +github.com/mattn/go-runewidth v0.0.19/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= From bc458abd57de23adbc0b120d632151efb32a7578 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Sun, 15 Feb 2026 14:21:56 -0500 Subject: [PATCH 03/15] add benchmarks --- graphemes/comparative/comparative_test.go | 71 +++++++++++++++++++++++ 1 file changed, 71 insertions(+) diff --git a/graphemes/comparative/comparative_test.go b/graphemes/comparative/comparative_test.go index 9420081..512264d 100644 --- a/graphemes/comparative/comparative_test.go +++ b/graphemes/comparative/comparative_test.go @@ -176,6 +176,77 @@ func TestAnsiBoundaryAgreement(t *testing.T) { } } +// ansiSample builds a realistic ANSI-heavy string simulating colored terminal output. +func ansiSample() string { + var b strings.Builder + colors := []string{ + "\x1b[1;34m", // bold blue + "\x1b[0;32m", // green + "\x1b[0;36m", // cyan + "\x1b[1;31m", // bold red + "\x1b[33m", // yellow + } + reset := "\x1b[0m" + lines := []string{ + "drwxr-xr-x 5 user staff 160 Jan 1 12:00 Documents", + "drwxr-xr-x 3 user staff 96 Feb 2 09:30 Downloads", + "-rwxr-xr-x 1 user staff 8432 Mar 15 14:22 build.sh", + "lrwxr-xr-x 1 user staff 11 Apr 20 08:00 config", + "-rw-r--r-- 1 user staff 1024 May 5 16:45 README.md", + } + for round := 0; round < 40; round++ { + for i, line := range lines { + color := colors[i%len(colors)] + if i%5 == 0 { + b.WriteString("\x1b]0;terminal - round ") + b.WriteString(string(rune('0' + round%10))) + b.WriteString("\x07") + } + b.WriteString(color) + b.WriteString(line) + b.WriteString(reset) + b.WriteString("\n") + } + } + return b.String() +} + +func BenchmarkAnsiIteration(b *testing.B) { + input := ansiSample() + n := int64(len(input)) + + b.Run("clipperhouse/uax29", func(b *testing.B) { 
+ b.SetBytes(n) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + count := 0 + g := graphemes.FromString(input) + g.AnsiEscapeSequences = true + for g.Next() { + count++ + } + } + }) + + b.Run("charmbracelet/x/ansi", func(b *testing.B) { + b.SetBytes(n) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + count := 0 + var state byte + remaining := input + for len(remaining) > 0 { + _, _, advance, newState := ansi.DecodeSequence(remaining, state, nil) + state = newState + remaining = remaining[advance:] + count++ + } + } + }) +} + // uax29Tokens segments the input using our graphemes iterator with ANSI support. func uax29Tokens(input string) []string { iter := graphemes.FromString(input) From e611f515f5ab490e32911586920866b4153b3bdd Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Sun, 15 Feb 2026 14:31:34 -0500 Subject: [PATCH 04/15] comments --- graphemes/README.md | 2 +- graphemes/iterator.go | 9 +++++++-- 2 files changed, 8 insertions(+), 3 deletions(-) diff --git a/graphemes/README.md b/graphemes/README.md index ca146e0..cfd4540 100644 --- a/graphemes/README.md +++ b/graphemes/README.md @@ -86,7 +86,7 @@ for g.Next() { } ``` -We implement [ECMA-48](https://ecma-international.org/publications-and-standards/standards/ecma-48/) C0 and C1 control codes, 7-bit and 8-bit, in UTF-8 encoding. +We implement [ECMA-48](https://ecma-international.org/publications-and-standards/standards/ecma-48/) C0 and C1 control codes, 7-bit and 8-bit. Note that 8-bit control codes are not UTF-8 encoded, and in fact are not valid UTF-8. Caveat emptor. ### Benchmarks diff --git a/graphemes/iterator.go b/graphemes/iterator.go index 8494296..3f18ade 100644 --- a/graphemes/iterator.go +++ b/graphemes/iterator.go @@ -27,8 +27,13 @@ type Iterator[T ~string | ~[]byte] struct { data T pos int start int - // AnsiEscapeSequences treats ANSI escape sequences (ECMA-48) as single grapheme - // clusters when true. Default is false. 
+ // AnsiEscapeSequences treats ANSI escape sequences (ECMA-48) as single + // grapheme clusters when true. The default is false. + // + // This option recognizes 7-bit and 8-bit control codes from ECMA-48. 8-bit + // control codes are not UTF-8 encoded, i.e. not valid UTF-8. If you + // choose this option, you are choosing to interpret non-UTF-8 data, + // caveat emptor. AnsiEscapeSequences bool } From 9fe5eb730f0254fd9486387a28f97fcc81a84401 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Sun, 15 Feb 2026 14:41:24 -0500 Subject: [PATCH 05/15] Add some adversarial tests. --- graphemes/ansi_test.go | 72 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/graphemes/ansi_test.go b/graphemes/ansi_test.go index 6d7b434..e288de8 100644 --- a/graphemes/ansi_test.go +++ b/graphemes/ansi_test.go @@ -257,6 +257,26 @@ func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { input: "\x84", expected: []string{"\x84"}, }, + { + name: "UTF-8 cafe", + input: "café", + expected: []string{"c", "a", "f", "é"}, + }, + { + name: "UTF-8 Japanese text", + input: "日本語", + expected: []string{"日", "本", "語"}, + }, + { + name: "UTF-8 runes with continuation bytes in C1 range", + input: "Āğל", + expected: []string{"Ā", "ğ", "ל"}, + }, + { + name: "mixed ANSI and UTF-8 adversarial payload", + input: "\x1b[31mĀğ日本語café\x1b[0m", + expected: []string{"\x1b[31m", "Ā", "ğ", "日", "本", "語", "c", "a", "f", "é", "\x1b[0m"}, + }, { name: "SOS canceled by CAN", input: "\x1bXhello\x18z", @@ -304,6 +324,58 @@ func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { } } +func TestAnsiEscapeSequencesPureUTF8Parity(t *testing.T) { + t.Parallel() + + samples := []string{ + "café", + "日本語", + "Āğל", + "A\u0301", + "👩🏽‍💻", + "Résumé — 東京 — 👍", + } + + collectString := func(input string, ansi bool) []string { + iter := graphemes.FromString(input) + iter.AnsiEscapeSequences = ansi + var out []string + for iter.Next() { + out = append(out, iter.Value()) + } + return out + } + + 
collectBytes := func(input string, ansi bool) []string { + iter := graphemes.FromBytes([]byte(input)) + iter.AnsiEscapeSequences = ansi + var out []string + for iter.Next() { + out = append(out, string(iter.Value())) + } + return out + } + + for i, sample := range samples { + sample := sample + t.Run("sample-"+string(rune('A'+i)), func(t *testing.T) { + t.Parallel() + + stringNoANSI := collectString(sample, false) + stringANSI := collectString(sample, true) + if !reflect.DeepEqual(stringNoANSI, stringANSI) { + t.Fatalf("string parity mismatch for %q\noff=%q\non=%q", sample, stringNoANSI, stringANSI) + } + + bytesNoANSI := collectBytes(sample, false) + bytesANSI := collectBytes(sample, true) + if !reflect.DeepEqual(bytesNoANSI, bytesANSI) { + t.Fatalf("bytes parity mismatch for %q\noff=%q\non=%q", sample, bytesNoANSI, bytesANSI) + } + }) + } +} + // ansiSample builds a string that mixes ANSI escape sequences with regular text, // simulating realistic terminal output (colored words, resets, bold, etc.). func ansiSample() string { From 25998bcd4a8005242bc5666339d30fff8e79c008 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Sun, 15 Feb 2026 15:46:52 -0500 Subject: [PATCH 06/15] Separate 7-bit and 8-bit options --- graphemes/README.md | 13 +- graphemes/ansi.go | 121 +++++--------- graphemes/ansi8.go | 75 +++++++++ graphemes/ansi_test.go | 183 ++++++++++++++++------ graphemes/comparative/comparative_test.go | 2 +- graphemes/comparative/go.mod | 2 +- graphemes/iterator.go | 23 ++- 7 files changed, 278 insertions(+), 141 deletions(-) create mode 100644 graphemes/ansi8.go diff --git a/graphemes/README.md b/graphemes/README.md index cfd4540..9a64051 100644 --- a/graphemes/README.md +++ b/graphemes/README.md @@ -74,7 +74,7 @@ for g.Next() { // Next() returns true until end of data ### ANSI escape sequences -By the UAX 29 specification, ANSI escape sequences are not grapheme clusters. To treat these sequences as a single cluster, set the `AnsiEscapeSequences` option to true. 
+By the UAX 29 specification, ANSI escape sequences are not grapheme clusters. To treat 7-bit ANSI escape sequences as a single cluster, set `AnsiEscapeSequences` to true. ```go text := "Hello, \x1b[31mworld\x1b[0m!" @@ -86,7 +86,16 @@ for g.Next() { } ``` -We implement [ECMA-48](https://ecma-international.org/publications-and-standards/standards/ecma-48/) C0 and C1 control codes, 7-bit and 8-bit. Note that 8-bit control codes are not UTF-8 encoded, and in fact are not valid UTF-8. Caveat emptor. +To also parse 8-bit C1 controls (non-UTF-8 bytes), set `AnsiEscapeSequences8Bit` to true. + +```go +g.AnsiEscapeSequences = true // 7-bit forms (ESC ...) +g.AnsiEscapeSequences8Bit = true // 8-bit C1 forms (0x80-0x9F), not valid UTF-8 +``` + +For ESC-initiated (7-bit) control strings, only 7-bit terminators are recognized. + +We implement [ECMA-48](https://ecma-international.org/publications-and-standards/standards/ecma-48/) control codes in both 7-bit and 8-bit representations. 8-bit control codes are not UTF-8 encoded and are not valid UTF-8. ### Benchmarks diff --git a/graphemes/ansi.go b/graphemes/ansi.go index 8aa9b96..ae0c6da 100644 --- a/graphemes/ansi.go +++ b/graphemes/ansi.go @@ -1,96 +1,59 @@ package graphemes -// ansiEscapeLength returns the byte length of a valid ANSI escape/control +// ansiEscapeLength returns the byte length of a valid 7-bit ANSI escape // sequence at the start of data, or 0 if none. // -// This recognizes both: -// - 7-bit representations (ESC + final/intermediate bytes), and -// - 8-bit C1 controls (raw bytes 0x80..0x9F per ECMA-48). 
-// // Recognized forms (ECMA-48 / ISO 6429): -// - CSI: ESC [ (or 0x9B) then parameter bytes (0x30–0x3F), intermediate (0x20–0x2F), final (0x40–0x7E) -// - OSC: ESC ] (or 0x9D) then payload until ST, BEL (0x07), CAN (0x18), or SUB (0x1A) -// - DCS, SOS, PM, APC: ESC P/X/^/_ (or 0x90/0x98/0x9E/0x9F) then payload until ST, CAN, or SUB -// - Two-byte: ESC + Fe/Fs (0x40–0x7E excluding above), or Fp (0x30–0x3F), or nF (0x20–0x2F then final) -// - Standalone C1 controls (0x80..0x9F not listed above): single byte +// - CSI: ESC [ then parameter bytes (0x30-0x3F), intermediate (0x20-0x2F), final (0x40-0x7E) +// - OSC: ESC ] then payload until BEL (0x07), 7-bit ST (ESC \), CAN (0x18), or SUB (0x1A) +// - DCS, SOS, PM, APC: ESC P/X/^/_ then payload until 7-bit ST (ESC \), CAN, or SUB +// - Two-byte: ESC + Fe/Fs (0x40-0x7E excluding above), or Fp (0x30-0x3F), or nF (0x20-0x2F then final) func ansiEscapeLength[T ~string | ~[]byte](data T) int { n := len(data) - if n == 0 { + if n < 2 || data[0] != esc { return 0 } - switch data[0] { - case esc: - if n < 2 { - return 0 - } - b1 := data[1] - switch b1 { - case '[': // CSI - body := csiLength(data[2:]) - if body == 0 { - return 0 - } - return 2 + body - case ']': // OSC – allows BEL or ST as terminator - body := oscLength(data[2:]) - if body < 0 { - return 0 - } - return 2 + body - case 'P', 'X', '^', '_': // DCS, SOS, PM, APC – require ST only - body := stSequenceLength(data[2:]) - if body < 0 { - return 0 - } - return 2 + body - } - if b1 >= 0x40 && b1 <= 0x7E { - // Fe/Fs two-byte; [ ] P X ^ _ handled above - return 2 - } - if b1 >= 0x30 && b1 <= 0x3F { - // Fp (private) two-byte - return 2 - } - if b1 >= 0x20 && b1 <= 0x2F { - // nF: intermediates then one final (0x30–0x7E) - i := 2 - for i < n && data[i] >= 0x20 && data[i] <= 0x2F { - i++ - } - if i < n && data[i] >= 0x30 && data[i] <= 0x7E { - return i + 1 - } - return 0 - } - - case 0x9B: // C1 CSI - body := csiLength(data[1:]) + b1 := data[1] + switch b1 { + case '[': // 
CSI + body := csiLength(data[2:]) if body == 0 { return 0 } - return 1 + body - - case 0x9D: // C1 OSC - body := oscLength(data[1:]) + return 2 + body + case ']': // OSC - allows BEL or 7-bit ST terminator + body := oscLength(data[2:]) if body < 0 { return 0 } - return 1 + body - - case 0x90, 0x98, 0x9E, 0x9F: // C1 DCS, SOS, PM, APC - body := stSequenceLength(data[1:]) + return 2 + body + case 'P', 'X', '^', '_': // DCS, SOS, PM, APC + body := stSequenceLength(data[2:]) if body < 0 { return 0 } - return 1 + body + return 2 + body + } - default: - if data[0] >= 0x80 && data[0] <= 0x9F { - // Any other C1 control is a single-byte sequence. - return 1 + if b1 >= 0x40 && b1 <= 0x7E { + // Fe/Fs two-byte; [ ] P X ^ _ handled above + return 2 + } + if b1 >= 0x30 && b1 <= 0x3F { + // Fp (private) two-byte + return 2 + } + if b1 >= 0x20 && b1 <= 0x2F { + // nF: intermediates then one final (0x30-0x7E) + i := 2 + for i < n && data[i] >= 0x20 && data[i] <= 0x2F { + i++ + } + if i < n && data[i] >= 0x30 && data[i] <= 0x7E { + return i + 1 } + return 0 } return 0 @@ -126,20 +89,19 @@ func csiLength[T ~string | ~[]byte](data T) int { } // oscLength returns the length of the OSC body. -// data is the slice after "ESC ]" (or C1 OSC). +// data is the slice after "ESC ]". // // Returns: // - n >= 0: consumed body length (includes BEL/ST terminator when present) // - -1: not terminated in the provided data // -// OSC accepts BEL (0x07) or ST as terminator by widespread convention. -// ST may be 7-bit (ESC \) or C1 (0x9C). +// OSC accepts BEL (0x07) or 7-bit ST (ESC \) as terminators by widespread convention. // Per ECMA-48, CAN (0x18) and SUB (0x1A) cancel the control string; in that // case they are not part of the OSC sequence length. 
func oscLength[T ~string | ~[]byte](data T) int { for i := 0; i < len(data); i++ { b := data[i] - if b == bel || b == st { + if b == bel { return i + 1 } if b == can || b == sub { @@ -153,14 +115,14 @@ func oscLength[T ~string | ~[]byte](data T) int { } // stSequenceLength returns the length of a control-string body. -// data is the slice after "ESC x" (or C1 DCS/SOS/PM/APC). +// data is the slice after "ESC x". // // Returns: // - n >= 0: consumed body length (includes ST terminator when present) // - -1: not terminated in the provided data // // Used for DCS, SOS, PM, and APC, which per ECMA-48 terminate with ST. -// ST may be 7-bit (ESC \) or C1 (0x9C). +// ST here is the 7-bit form (ESC \). // CAN (0x18) and SUB (0x1A) cancel the control string; in that case they are // not part of the sequence length. func stSequenceLength[T ~string | ~[]byte](data T) int { @@ -168,9 +130,6 @@ func stSequenceLength[T ~string | ~[]byte](data T) int { if data[i] == can || data[i] == sub { return i } - if data[i] == st { - return i + 1 - } if data[i] == esc && i+1 < len(data) && data[i+1] == '\\' { return i + 2 } diff --git a/graphemes/ansi8.go b/graphemes/ansi8.go new file mode 100644 index 0000000..97d59c9 --- /dev/null +++ b/graphemes/ansi8.go @@ -0,0 +1,75 @@ +package graphemes + +// ansiEscapeLength8Bit returns the byte length of a valid 8-bit C1 ANSI +// sequence at the start of data, or 0 if none. 
+// +// Recognized forms (ECMA-48 / ISO 6429): +// - C1 CSI (0x9B) body as parameter/intermediate/final bytes +// - C1 OSC (0x9D) body terminated by BEL, C1 ST, 7-bit ST, CAN, or SUB +// - C1 DCS/SOS/PM/APC (0x90/0x98/0x9E/0x9F) body terminated by C1 ST, 7-bit ST, CAN, or SUB +// - Standalone C1 controls (0x80..0x9F not listed above): single byte +func ansiEscapeLength8Bit[T ~string | ~[]byte](data T) int { + if len(data) == 0 { + return 0 + } + + switch data[0] { + case 0x9B: // C1 CSI + body := csiLength(data[1:]) + if body == 0 { + return 0 + } + return 1 + body + case 0x9D: // C1 OSC + body := oscLengthC1(data[1:]) + if body < 0 { + return 0 + } + return 1 + body + case 0x90, 0x98, 0x9E, 0x9F: // C1 DCS, SOS, PM, APC + body := stSequenceLengthC1(data[1:]) + if body < 0 { + return 0 + } + return 1 + body + default: + if data[0] >= 0x80 && data[0] <= 0x9F { + return 1 + } + } + + return 0 +} + +func oscLengthC1[T ~string | ~[]byte](data T) int { + for i := 0; i < len(data); i++ { + b := data[i] + if b == bel || b == st { + return i + 1 + } + if b == can || b == sub { + return i + } + if b == esc && i+1 < len(data) && data[i+1] == '\\' { + return i + 2 + } + } + return -1 +} + +// stSequenceLengthC1 parses DCS/SOS/PM/APC bodies that may +// terminate with either 7-bit ST (ESC \) or C1 ST (0x9C). 
+func stSequenceLengthC1[T ~string | ~[]byte](data T) int { + for i := 0; i < len(data); i++ { + if data[i] == can || data[i] == sub { + return i + } + if data[i] == st { + return i + 1 + } + if data[i] == esc && i+1 < len(data) && data[i+1] == '\\' { + return i + 2 + } + } + return -1 +} diff --git a/graphemes/ansi_test.go b/graphemes/ansi_test.go index e288de8..df33564 100644 --- a/graphemes/ansi_test.go +++ b/graphemes/ansi_test.go @@ -9,14 +9,104 @@ import ( "github.com/clipperhouse/uax29/v2/testdata" ) -func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { +type ansiCase struct { + name string + input string + expected []string +} + +func assertANSITokens(t *testing.T, input string, expected []string, sevenBit, eightBit bool) { + t.Helper() + + assertEqual := func(kind string, got []string) { + t.Helper() + if !reflect.DeepEqual(got, expected) { + t.Errorf("%s mismatch\ngot %q\nexpected %q", kind, got, expected) + } + } + + iterString := graphemes.FromString(input) + iterString.AnsiEscapeSequences = sevenBit + iterString.AnsiEscapeSequences8Bit = eightBit + var gotString []string + for iterString.Next() { + gotString = append(gotString, iterString.Value()) + } + assertEqual("string", gotString) + + iterBytes := graphemes.FromBytes([]byte(input)) + iterBytes.AnsiEscapeSequences = sevenBit + iterBytes.AnsiEscapeSequences8Bit = eightBit + var gotBytes []string + for iterBytes.Next() { + gotBytes = append(gotBytes, string(iterBytes.Value())) + } + assertEqual("bytes", gotBytes) +} + +func runANSICases(t *testing.T, tests []ansiCase, sevenBit, eightBit bool) { + t.Helper() + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + assertANSITokens(t, tt.input, tt.expected, sevenBit, eightBit) + }) + } +} + +func TestAnsiEscapeSequences7BitOnlyAsGraphemes(t *testing.T) { + t.Parallel() + + tests := []ansiCase{ + {name: "SGR reset", input: "\x1b[0m", expected: []string{"\x1b[0m"}}, + {name: "SGR red then text", input: 
"\x1b[31mhello", expected: []string{"\x1b[31m", "h", "e", "l", "l", "o"}}, + {name: "CSI with valid intermediate", input: "\x1b[0 q", expected: []string{"\x1b[0 q"}}, + {name: "OSC window title then BEL", input: "\x1b]0;My Title\x07", expected: []string{"\x1b]0;My Title\x07"}}, + {name: "OSC window title then ST", input: "\x1b]0;Title\x1b\\", expected: []string{"\x1b]0;Title\x1b\\"}}, + {name: "DCS with ST terminator", input: "\x1bPq#0;2;0;0;0\x1b\\", expected: []string{"\x1bPq#0;2;0;0;0\x1b\\"}}, + {name: "DCS canceled by CAN", input: "\x1bPqdata\x18z", expected: []string{"\x1bPqdata", "\x18", "z"}}, + {name: "SOS with ST terminator", input: "\x1bXhello\x1b\\", expected: []string{"\x1bXhello\x1b\\"}}, + {name: "PM with ST terminator", input: "\x1b^msg\x1b\\", expected: []string{"\x1b^msg\x1b\\"}}, + {name: "APC with ST terminator", input: "\x1b_data\x1b\\", expected: []string{"\x1b_data\x1b\\"}}, + {name: "two-byte Fe", input: "\x1bD", expected: []string{"\x1bD"}}, + {name: "two-byte Fp", input: "\x1b7", expected: []string{"\x1b7"}}, + {name: "nF with multiple intermediates", input: "\x1b !Fx", expected: []string{"\x1b !F", "x"}}, + {name: "malformed CSI remains split", input: "\x1b[ 1mok", expected: []string{"\x1b", "[", " ", "1", "m", "o", "k"}}, + {name: "C1 CSI is not parsed", input: "\x9B31mhello", expected: []string{"\x9B", "3", "1", "m", "h", "e", "l", "l", "o"}}, + {name: "7-bit OSC does not accept C1 ST", input: "\x1b]0;Title\x9Cz", expected: []string{"\x1b", "]", "0", ";", "T", "i", "t", "l", "e", "\x9C", "z"}}, + } + + runANSICases(t, tests, true, false) +} + +func TestAnsiEscapeSequences8BitOnlyAsGraphemes(t *testing.T) { t.Parallel() - tests := []struct { - name string - input string - expected []string - }{ + tests := []ansiCase{ + {name: "C1 CSI then text", input: "\x9B31mhello", expected: []string{"\x9B31m", "h", "e", "l", "l", "o"}}, + {name: "C1 CSI multiple params", input: "\x9B1;2;3m", expected: []string{"\x9B1;2;3m"}}, + {name: "C1 OSC with C1 
ST", input: "\x9D0;Title\x9C", expected: []string{"\x9D0;Title\x9C"}}, + {name: "C1 OSC with 7-bit ST", input: "\x9D0;Title\x1b\\", expected: []string{"\x9D0;Title\x1b\\"}}, + {name: "C1 DCS with C1 ST", input: "\x90qpayload\x9C", expected: []string{"\x90qpayload\x9C"}}, + {name: "C1 DCS with 7-bit ST", input: "\x90qpayload\x1b\\", expected: []string{"\x90qpayload\x1b\\"}}, + {name: "C1 DCS canceled by CAN", input: "\x90qpayload\x18x", expected: []string{"\x90qpayload", "\x18", "x"}}, + {name: "C1 SOS with C1 ST", input: "\x98hello\x9C", expected: []string{"\x98hello\x9C"}}, + {name: "C1 PM with 7-bit ST", input: "\x9Emsg\x1b\\", expected: []string{"\x9Emsg\x1b\\"}}, + {name: "C1 APC with C1 ST", input: "\x9Fdata\x9C", expected: []string{"\x9Fdata\x9C"}}, + {name: "single C1 Fe control", input: "\x84", expected: []string{"\x84"}}, + {name: "C1 OSC unterminated", input: "\x9D0;title", expected: []string{"\x9D", "0", ";", "t", "i", "t", "l", "e"}}, + {name: "C1 DCS unterminated", input: "\x90data", expected: []string{"\x90", "d", "a", "t", "a"}}, + {name: "7-bit ESC sequence is not parsed", input: "\x1b[31mhello", expected: []string{"\x1b", "[", "3", "1", "m", "h", "e", "l", "l", "o"}}, + } + + runANSICases(t, tests, false, true) +} + +func TestAnsiEscapeSequencesBothEnabledAsGraphemes(t *testing.T) { + t.Parallel() + + tests := []ansiCase{ { name: "SGR reset", input: "\x1b[0m", @@ -37,6 +127,11 @@ func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { input: "\x1b]0;My Title\x07", expected: []string{"\x1b]0;My Title\x07"}, }, + { + name: "OSC UTF-8 payload does not terminate at continuation byte", + input: "\x1b]0;本\x07", + expected: []string{"\x1b]0;本\x07"}, + }, { name: "OSC window title then ST", input: "\x1b]0;Title\x1b\\", @@ -47,6 +142,11 @@ func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { input: "\x1bPq#0;2;0;0;0\x1b\\", expected: []string{"\x1bPq#0;2;0;0;0\x1b\\"}, }, + { + name: "DCS UTF-8 payload does not terminate at continuation byte", + input: 
"\x1bPq本\x1b\\", + expected: []string{"\x1bPq本\x1b\\"}, + }, { name: "DCS with BEL in payload is not a single sequence", input: "\x1bPq\x07rest", @@ -175,7 +275,7 @@ func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { { name: "7-bit OSC with C1 ST terminator", input: "\x1b]0;Title\x9C", - expected: []string{"\x1b]0;Title\x9C"}, + expected: []string{"\x1b", "]", "0", ";", "T", "i", "t", "l", "e", "\x9C"}, }, { name: "C1 DCS with C1 ST terminator", @@ -195,7 +295,7 @@ func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { { name: "7-bit DCS with C1 ST terminator", input: "\x1bPqpayload\x9C", - expected: []string{"\x1bPqpayload\x9C"}, + expected: []string{"\x1b", "P", "q", "p", "a", "y", "l", "o", "a", "d", "\x9C"}, }, { name: "C1 Fe IND control", @@ -294,34 +394,7 @@ func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { }, } - for _, tt := range tests { - tt := tt - t.Run(tt.name, func(t *testing.T) { - t.Parallel() - assertEqual := func(kind string, got []string) { - t.Helper() - if !reflect.DeepEqual(got, tt.expected) { - t.Errorf("%s mismatch\ngot %q\nexpected %q", kind, got, tt.expected) - } - } - - iterString := graphemes.FromString(tt.input) - iterString.AnsiEscapeSequences = true - var gotString []string - for iterString.Next() { - gotString = append(gotString, iterString.Value()) - } - assertEqual("string", gotString) - - iterBytes := graphemes.FromBytes([]byte(tt.input)) - iterBytes.AnsiEscapeSequences = true - var gotBytes []string - for iterBytes.Next() { - gotBytes = append(gotBytes, string(iterBytes.Value())) - } - assertEqual("bytes", gotBytes) - }) - } + runANSICases(t, tests, true, true) } func TestAnsiEscapeSequencesPureUTF8Parity(t *testing.T) { @@ -336,9 +409,10 @@ func TestAnsiEscapeSequencesPureUTF8Parity(t *testing.T) { "Résumé — 東京 — 👍", } - collectString := func(input string, ansi bool) []string { + collectString := func(input string, ansi7, ansi8 bool) []string { iter := graphemes.FromString(input) - iter.AnsiEscapeSequences = ansi + 
iter.AnsiEscapeSequences = ansi7 + iter.AnsiEscapeSequences8Bit = ansi8 var out []string for iter.Next() { out = append(out, iter.Value()) @@ -346,9 +420,10 @@ func TestAnsiEscapeSequencesPureUTF8Parity(t *testing.T) { return out } - collectBytes := func(input string, ansi bool) []string { + collectBytes := func(input string, ansi7, ansi8 bool) []string { iter := graphemes.FromBytes([]byte(input)) - iter.AnsiEscapeSequences = ansi + iter.AnsiEscapeSequences = ansi7 + iter.AnsiEscapeSequences8Bit = ansi8 var out []string for iter.Next() { out = append(out, string(iter.Value())) @@ -361,16 +436,26 @@ func TestAnsiEscapeSequencesPureUTF8Parity(t *testing.T) { t.Run("sample-"+string(rune('A'+i)), func(t *testing.T) { t.Parallel() - stringNoANSI := collectString(sample, false) - stringANSI := collectString(sample, true) - if !reflect.DeepEqual(stringNoANSI, stringANSI) { - t.Fatalf("string parity mismatch for %q\noff=%q\non=%q", sample, stringNoANSI, stringANSI) - } + stringBase := collectString(sample, false, false) + for _, flags := range []struct { + name string + ansi7 bool + ansi8 bool + }{ + {name: "7-bit only", ansi7: true, ansi8: false}, + {name: "8-bit only", ansi7: false, ansi8: true}, + {name: "both", ansi7: true, ansi8: true}, + } { + gotString := collectString(sample, flags.ansi7, flags.ansi8) + if !reflect.DeepEqual(stringBase, gotString) { + t.Fatalf("string parity mismatch for %q (%s)\noff=%q\non=%q", sample, flags.name, stringBase, gotString) + } - bytesNoANSI := collectBytes(sample, false) - bytesANSI := collectBytes(sample, true) - if !reflect.DeepEqual(bytesNoANSI, bytesANSI) { - t.Fatalf("bytes parity mismatch for %q\noff=%q\non=%q", sample, bytesNoANSI, bytesANSI) + bytesBase := collectBytes(sample, false, false) + gotBytes := collectBytes(sample, flags.ansi7, flags.ansi8) + if !reflect.DeepEqual(bytesBase, gotBytes) { + t.Fatalf("bytes parity mismatch for %q (%s)\noff=%q\non=%q", sample, flags.name, bytesBase, gotBytes) + } } }) } diff --git 
a/graphemes/comparative/comparative_test.go b/graphemes/comparative/comparative_test.go index 512264d..11e6fb1 100644 --- a/graphemes/comparative/comparative_test.go +++ b/graphemes/comparative/comparative_test.go @@ -139,7 +139,6 @@ func TestAnsiBoundaryAgreement(t *testing.T) { {"nF charset G0 then text", "\x1b(Btext"}, // DCS with params - {"DCS with params and C1 ST", "\x1bP0;1|17/ab\x9c"}, {"C1 DCS with params and C1 ST", "\x90?123;456+q\x9c"}, // APC payload (Kitty graphics protocol) @@ -251,6 +250,7 @@ func BenchmarkAnsiIteration(b *testing.B) { func uax29Tokens(input string) []string { iter := graphemes.FromString(input) iter.AnsiEscapeSequences = true + iter.AnsiEscapeSequences8Bit = true var tokens []string for iter.Next() { tokens = append(tokens, iter.Value()) diff --git a/graphemes/comparative/go.mod b/graphemes/comparative/go.mod index 24fdeec..33c156e 100644 --- a/graphemes/comparative/go.mod +++ b/graphemes/comparative/go.mod @@ -3,12 +3,12 @@ module github.com/clipperhouse/uax29/graphemes/comparative go 1.24.2 require ( + github.com/charmbracelet/x/ansi v0.11.6 github.com/clipperhouse/uax29/v2 v2.6.0 github.com/rivo/uniseg v0.4.7 ) require ( - github.com/charmbracelet/x/ansi v0.11.6 // indirect github.com/clipperhouse/displaywidth v0.9.0 // indirect github.com/clipperhouse/stringish v0.1.1 // indirect github.com/lucasb-eyer/go-colorful v1.3.0 // indirect diff --git a/graphemes/iterator.go b/graphemes/iterator.go index 3f18ade..9266757 100644 --- a/graphemes/iterator.go +++ b/graphemes/iterator.go @@ -27,14 +27,17 @@ type Iterator[T ~string | ~[]byte] struct { data T pos int start int - // AnsiEscapeSequences treats ANSI escape sequences (ECMA-48) as single - // grapheme clusters when true. The default is false. + // AnsiEscapeSequences treats 7-bit ANSI escape sequences (ECMA-48) as + // single grapheme clusters when true. The default is false. // - // This option recognizes 7-bit and 8-bit control codes from ECMA-48. 
8-bit - // control codes are not UTF-8 encoded, i.e. not valid UTF-8. If you - // choose this option, you are choosing to interpret non-UTF-8 data, - // caveat emptor. + // 8-bit controls are not enabled by this option. See AnsiEscapeSequences8Bit. AnsiEscapeSequences bool + // AnsiEscapeSequences8Bit treats 8-bit C1 control codes (ECMA-48) as single + // grapheme clusters when true. The default is false. + // + // 8-bit control bytes are not UTF-8 encoded, i.e. not valid UTF-8. If you + // choose this option, you are choosing to interpret non-UTF-8 data. + AnsiEscapeSequences8Bit bool } var ( @@ -60,12 +63,18 @@ func (iter *Iterator[T]) Next() bool { iter.start = iter.pos b := iter.data[iter.pos] - if iter.AnsiEscapeSequences && (b == esc || (b >= 0x80 && b <= 0x9F)) { + if iter.AnsiEscapeSequences && b == esc { if a := ansiEscapeLength(iter.data[iter.pos:]); a > 0 { iter.pos += a return true } } + if iter.AnsiEscapeSequences8Bit && b >= 0x80 && b <= 0x9F { + if a := ansiEscapeLength8Bit(iter.data[iter.pos:]); a > 0 { + iter.pos += a + return true + } + } // ASCII hot path: any ASCII is one grapheme when next byte is ASCII or end. if b < utf8.RuneSelf && b != cr { From 5740126778ae5af6c39dcf05cefde5ee26a691d1 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Sun, 15 Feb 2026 15:54:29 -0500 Subject: [PATCH 07/15] more benchmarks --- graphemes/README.md | 2 +- graphemes/ansi_test.go | 140 ++++++++++++++++++++-- graphemes/comparative/comparative_test.go | 80 +++++++++++-- graphemes/iterator.go | 5 +- 4 files changed, 208 insertions(+), 19 deletions(-) diff --git a/graphemes/README.md b/graphemes/README.md index 9a64051..d58ae69 100644 --- a/graphemes/README.md +++ b/graphemes/README.md @@ -95,7 +95,7 @@ g.AnsiEscapeSequences8Bit = true // 8-bit C1 forms (0x80-0x9F), not valid UTF-8 For ESC-initiated (7-bit) control strings, only 7-bit terminators are recognized. 
-We implement [ECMA-48](https://ecma-international.org/publications-and-standards/standards/ecma-48/) control codes in both 7-bit and 8-bit representations. 8-bit control codes are not UTF-8 encoded and are not valid UTF-8. +We implement [ECMA-48](https://ecma-international.org/publications-and-standards/standards/ecma-48/) control codes in both 7-bit and 8-bit representations. 8-bit control codes are not UTF-8 encoded and are not valid UTF-8, caveat emptor. ### Benchmarks diff --git a/graphemes/ansi_test.go b/graphemes/ansi_test.go index df33564..e42d018 100644 --- a/graphemes/ansi_test.go +++ b/graphemes/ansi_test.go @@ -508,21 +508,129 @@ func ansiSample() string { return b.String() } +// ansiSample8Bit builds a string that uses 8-bit C1 initiators. +func ansiSample8Bit() string { + var b strings.Builder + + lines := []string{ + "drwxr-xr-x 5 user staff 160 Jan 1 12:00 Documents", + "drwxr-xr-x 3 user staff 96 Feb 2 09:30 Downloads", + "-rwxr-xr-x 1 user staff 8432 Mar 15 14:22 build.sh", + "lrwxr-xr-x 1 user staff 11 Apr 20 08:00 config -> /etc/config", + "-rw-r--r-- 1 user staff 1024 May 5 16:45 README.md", + "total 42", + "drwxr-xr-x 2 user staff 64 Jun 10 11:11 src", + "-rw-r--r-- 1 user staff 512 Jul 7 07:07 main.go", + "error: file not found: missing.txt", + "warning: deprecated function used in line 42", + } + + for round := 0; round < 20; round++ { + for i, line := range lines { + // C1 OSC: 0x9D ... BEL + if i%5 == 0 { + b.WriteByte(0x9D) + b.WriteString("0;terminal - round ") + b.WriteString(string(rune('0' + round%10))) + b.WriteByte(0x07) + } + // C1 CSI SGR: 0x9B ... m + b.WriteByte(0x9B) + b.WriteString("1;3") + b.WriteString(string(rune('0' + (i % 8)))) + b.WriteByte('m') + b.WriteString(line) + // C1 CSI reset: 0x9B0m + b.WriteByte(0x9B) + b.WriteString("0m") + b.WriteByte('\n') + } + } + return b.String() +} + +// ansiSampleMixed builds a string with both 7-bit and 8-bit ANSI forms. 
+func ansiSampleMixed() string { + var b strings.Builder + a7 := ansiSample() + a8 := ansiSample8Bit() + b.WriteString(a7) + b.WriteString(a8) + return b.String() +} + // BenchmarkAnsiOption benchmarks the iterator on text that contains ANSI escapes, // and on plain text, with the AnsiEscapeSequences option on and off. func BenchmarkAnsiOption(b *testing.B) { - ansi := ansiSample() + ansi7 := ansiSample() + ansi8 := ansiSample8Bit() + ansiMixed := ansiSampleMixed() plain, err := testdata.Sample() if err != nil { b.Fatal(err) } plainStr := string(plain) - b.Run("AnsiText/OptionOn", func(b *testing.B) { - b.SetBytes(int64(len(ansi))) + b.Run("AnsiText7Bit/Option7BitOn", func(b *testing.B) { + b.SetBytes(int64(len(ansi7))) + for i := 0; i < b.N; i++ { + iter := graphemes.FromString(ansi7) + iter.AnsiEscapeSequences = true + c := 0 + for iter.Next() { + _ = iter.Value() + c++ + } + b.ReportMetric(float64(c), "tokens") + } + }) + + b.Run("AnsiText7Bit/OptionOff", func(b *testing.B) { + b.SetBytes(int64(len(ansi7))) + for i := 0; i < b.N; i++ { + iter := graphemes.FromString(ansi7) + c := 0 + for iter.Next() { + _ = iter.Value() + c++ + } + b.ReportMetric(float64(c), "tokens") + } + }) + + b.Run("AnsiText8Bit/Option8BitOn", func(b *testing.B) { + b.SetBytes(int64(len(ansi8))) + for i := 0; i < b.N; i++ { + iter := graphemes.FromString(ansi8) + iter.AnsiEscapeSequences8Bit = true + c := 0 + for iter.Next() { + _ = iter.Value() + c++ + } + b.ReportMetric(float64(c), "tokens") + } + }) + + b.Run("AnsiText8Bit/OptionOff", func(b *testing.B) { + b.SetBytes(int64(len(ansi8))) for i := 0; i < b.N; i++ { - iter := graphemes.FromString(ansi) + iter := graphemes.FromString(ansi8) + c := 0 + for iter.Next() { + _ = iter.Value() + c++ + } + b.ReportMetric(float64(c), "tokens") + } + }) + + b.Run("AnsiTextMixed/BothOptionsOn", func(b *testing.B) { + b.SetBytes(int64(len(ansiMixed))) + for i := 0; i < b.N; i++ { + iter := graphemes.FromString(ansiMixed) iter.AnsiEscapeSequences = true 
+ iter.AnsiEscapeSequences8Bit = true c := 0 for iter.Next() { _ = iter.Value() @@ -532,10 +640,25 @@ func BenchmarkAnsiOption(b *testing.B) { } }) - b.Run("AnsiText/OptionOff", func(b *testing.B) { - b.SetBytes(int64(len(ansi))) + b.Run("PlainText/Option7BitOn", func(b *testing.B) { + b.SetBytes(int64(len(plainStr))) for i := 0; i < b.N; i++ { - iter := graphemes.FromString(ansi) + iter := graphemes.FromString(plainStr) + iter.AnsiEscapeSequences = true + c := 0 + for iter.Next() { + _ = iter.Value() + c++ + } + b.ReportMetric(float64(c), "tokens") + } + }) + + b.Run("PlainText/Option8BitOn", func(b *testing.B) { + b.SetBytes(int64(len(plainStr))) + for i := 0; i < b.N; i++ { + iter := graphemes.FromString(plainStr) + iter.AnsiEscapeSequences8Bit = true c := 0 for iter.Next() { _ = iter.Value() @@ -545,11 +668,12 @@ func BenchmarkAnsiOption(b *testing.B) { } }) - b.Run("PlainText/OptionOn", func(b *testing.B) { + b.Run("PlainText/BothOptionsOn", func(b *testing.B) { b.SetBytes(int64(len(plainStr))) for i := 0; i < b.N; i++ { iter := graphemes.FromString(plainStr) iter.AnsiEscapeSequences = true + iter.AnsiEscapeSequences8Bit = true c := 0 for iter.Next() { _ = iter.Value() diff --git a/graphemes/comparative/comparative_test.go b/graphemes/comparative/comparative_test.go index 11e6fb1..f91de82 100644 --- a/graphemes/comparative/comparative_test.go +++ b/graphemes/comparative/comparative_test.go @@ -210,17 +210,52 @@ func ansiSample() string { return b.String() } +func ansiSample8Bit() string { + var b strings.Builder + lines := []string{ + "drwxr-xr-x 5 user staff 160 Jan 1 12:00 Documents", + "drwxr-xr-x 3 user staff 96 Feb 2 09:30 Downloads", + "-rwxr-xr-x 1 user staff 8432 Mar 15 14:22 build.sh", + "lrwxr-xr-x 1 user staff 11 Apr 20 08:00 config", + "-rw-r--r-- 1 user staff 1024 May 5 16:45 README.md", + } + for round := 0; round < 40; round++ { + for i, line := range lines { + if i%5 == 0 { + b.WriteByte(0x9D) + b.WriteString("0;terminal - round ") + 
b.WriteString(string(rune('0' + round%10))) + b.WriteByte(0x07) + } + b.WriteByte(0x9B) + b.WriteString("1;3") + b.WriteString(string(rune('0' + (i % 8)))) + b.WriteByte('m') + b.WriteString(line) + b.WriteByte(0x9B) + b.WriteString("0m") + b.WriteString("\n") + } + } + return b.String() +} + +func ansiSampleMixed() string { + return ansiSample() + ansiSample8Bit() +} + func BenchmarkAnsiIteration(b *testing.B) { - input := ansiSample() - n := int64(len(input)) + input7 := ansiSample() + input8 := ansiSample8Bit() + inputMixed := ansiSampleMixed() - b.Run("clipperhouse/uax29", func(b *testing.B) { - b.SetBytes(n) + b.Run("clipperhouse/uax29/7bit", func(b *testing.B) { + b.SetBytes(int64(len(input7))) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { count := 0 - g := graphemes.FromString(input) + g := graphemes.FromString(input7) g.AnsiEscapeSequences = true for g.Next() { count++ @@ -228,14 +263,43 @@ func BenchmarkAnsiIteration(b *testing.B) { } }) - b.Run("charmbracelet/x/ansi", func(b *testing.B) { - b.SetBytes(n) + b.Run("clipperhouse/uax29/8bit", func(b *testing.B) { + b.SetBytes(int64(len(input8))) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + count := 0 + g := graphemes.FromString(input8) + g.AnsiEscapeSequences8Bit = true + for g.Next() { + count++ + } + } + }) + + b.Run("clipperhouse/uax29/both", func(b *testing.B) { + b.SetBytes(int64(len(inputMixed))) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + count := 0 + g := graphemes.FromString(inputMixed) + g.AnsiEscapeSequences = true + g.AnsiEscapeSequences8Bit = true + for g.Next() { + count++ + } + } + }) + + b.Run("charmbracelet/x/ansi/mixed", func(b *testing.B) { + b.SetBytes(int64(len(inputMixed))) b.ReportAllocs() b.ResetTimer() for i := 0; i < b.N; i++ { count := 0 var state byte - remaining := input + remaining := inputMixed for len(remaining) > 0 { _, _, advance, newState := ansi.DecodeSequence(remaining, state, nil) state = newState diff --git 
a/graphemes/iterator.go b/graphemes/iterator.go index 9266757..a3f9aac 100644 --- a/graphemes/iterator.go +++ b/graphemes/iterator.go @@ -30,13 +30,14 @@ type Iterator[T ~string | ~[]byte] struct { // AnsiEscapeSequences treats 7-bit ANSI escape sequences (ECMA-48) as // single grapheme clusters when true. The default is false. // - // 8-bit controls are not enabled by this option. See AnsiEscapeSequences8Bit. + // 8-bit controls are not enabled by this option. See [AnsiEscapeSequences8Bit]. AnsiEscapeSequences bool // AnsiEscapeSequences8Bit treats 8-bit C1 control codes (ECMA-48) as single // grapheme clusters when true. The default is false. // // 8-bit control bytes are not UTF-8 encoded, i.e. not valid UTF-8. If you - // choose this option, you are choosing to interpret non-UTF-8 data. + // choose this option, you are choosing to interpret non-UTF-8 data, caveat + // emptor. AnsiEscapeSequences8Bit bool } From e40113c3ac9c9cb71c9a14cd4be70f7879fd8d21 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Sun, 15 Feb 2026 16:32:52 -0500 Subject: [PATCH 08/15] Stricter 8-bit terminators --- graphemes/README.md | 1 + graphemes/ansi.go | 6 +++--- graphemes/ansi8.go | 16 +++++----------- graphemes/ansi_test.go | 12 ++++++------ graphemes/comparative/comparative_test.go | 3 --- 5 files changed, 15 insertions(+), 23 deletions(-) diff --git a/graphemes/README.md b/graphemes/README.md index d58ae69..3f8a5e3 100644 --- a/graphemes/README.md +++ b/graphemes/README.md @@ -94,6 +94,7 @@ g.AnsiEscapeSequences8Bit = true // 8-bit C1 forms (0x80-0x9F), not valid UTF-8 ``` For ESC-initiated (7-bit) control strings, only 7-bit terminators are recognized. +For C1-initiated (8-bit) control strings, only C1 ST (`0x9C`) is recognized as ST. We implement [ECMA-48](https://ecma-international.org/publications-and-standards/standards/ecma-48/) control codes in both 7-bit and 8-bit representations. 8-bit control codes are not UTF-8 encoded and are not valid UTF-8, caveat emptor. 
diff --git a/graphemes/ansi.go b/graphemes/ansi.go index ae0c6da..9cd09b4 100644 --- a/graphemes/ansi.go +++ b/graphemes/ansi.go @@ -17,7 +17,7 @@ func ansiEscapeLength[T ~string | ~[]byte](data T) int { b1 := data[1] switch b1 { case '[': // CSI - body := csiLength(data[2:]) + body := csiBodyLength(data[2:]) if body == 0 { return 0 } @@ -59,14 +59,14 @@ func ansiEscapeLength[T ~string | ~[]byte](data T) int { return 0 } -// csiLength returns the length of the CSI body (param/intermediate/final bytes). +// csiBodyLength returns the length of the CSI body (param/intermediate/final bytes). // data is the slice after "ESC [". // Per ECMA-48, the CSI body has the form: // // parameters (0x30–0x3F)*, intermediates (0x20–0x2F)*, final (0x40–0x7E) // // Once an intermediate byte is seen, subsequent parameter bytes are invalid. -func csiLength[T ~string | ~[]byte](data T) int { +func csiBodyLength[T ~string | ~[]byte](data T) int { seenIntermediate := false for i := 0; i < len(data); i++ { b := data[i] diff --git a/graphemes/ansi8.go b/graphemes/ansi8.go index 97d59c9..a5fde3b 100644 --- a/graphemes/ansi8.go +++ b/graphemes/ansi8.go @@ -5,8 +5,8 @@ package graphemes // // Recognized forms (ECMA-48 / ISO 6429): // - C1 CSI (0x9B) body as parameter/intermediate/final bytes -// - C1 OSC (0x9D) body terminated by BEL, C1 ST, 7-bit ST, CAN, or SUB -// - C1 DCS/SOS/PM/APC (0x90/0x98/0x9E/0x9F) body terminated by C1 ST, 7-bit ST, CAN, or SUB +// - C1 OSC (0x9D) body terminated by BEL, C1 ST, CAN, or SUB +// - C1 DCS/SOS/PM/APC (0x90/0x98/0x9E/0x9F) body terminated by C1 ST, CAN, or SUB // - Standalone C1 controls (0x80..0x9F not listed above): single byte func ansiEscapeLength8Bit[T ~string | ~[]byte](data T) int { if len(data) == 0 { @@ -15,7 +15,7 @@ func ansiEscapeLength8Bit[T ~string | ~[]byte](data T) int { switch data[0] { case 0x9B: // C1 CSI - body := csiLength(data[1:]) + body := csiBodyLength(data[1:]) if body == 0 { return 0 } @@ -50,15 +50,12 @@ func oscLengthC1[T 
~string | ~[]byte](data T) int { if b == can || b == sub { return i } - if b == esc && i+1 < len(data) && data[i+1] == '\\' { - return i + 2 - } } return -1 } -// stSequenceLengthC1 parses DCS/SOS/PM/APC bodies that may -// terminate with either 7-bit ST (ESC \) or C1 ST (0x9C). +// stSequenceLengthC1 parses DCS/SOS/PM/APC bodies that terminate with C1 ST +// (0x9C), or are canceled by CAN/SUB. func stSequenceLengthC1[T ~string | ~[]byte](data T) int { for i := 0; i < len(data); i++ { if data[i] == can || data[i] == sub { @@ -67,9 +64,6 @@ func stSequenceLengthC1[T ~string | ~[]byte](data T) int { if data[i] == st { return i + 1 } - if data[i] == esc && i+1 < len(data) && data[i+1] == '\\' { - return i + 2 - } } return -1 } diff --git a/graphemes/ansi_test.go b/graphemes/ansi_test.go index e42d018..74137ea 100644 --- a/graphemes/ansi_test.go +++ b/graphemes/ansi_test.go @@ -87,12 +87,12 @@ func TestAnsiEscapeSequences8BitOnlyAsGraphemes(t *testing.T) { {name: "C1 CSI then text", input: "\x9B31mhello", expected: []string{"\x9B31m", "h", "e", "l", "l", "o"}}, {name: "C1 CSI multiple params", input: "\x9B1;2;3m", expected: []string{"\x9B1;2;3m"}}, {name: "C1 OSC with C1 ST", input: "\x9D0;Title\x9C", expected: []string{"\x9D0;Title\x9C"}}, - {name: "C1 OSC with 7-bit ST", input: "\x9D0;Title\x1b\\", expected: []string{"\x9D0;Title\x1b\\"}}, + {name: "C1 OSC with 7-bit ST is not parsed as one sequence", input: "\x9D0;Title\x1b\\", expected: []string{"\x9D", "0", ";", "T", "i", "t", "l", "e", "\x1b", "\\"}}, {name: "C1 DCS with C1 ST", input: "\x90qpayload\x9C", expected: []string{"\x90qpayload\x9C"}}, - {name: "C1 DCS with 7-bit ST", input: "\x90qpayload\x1b\\", expected: []string{"\x90qpayload\x1b\\"}}, + {name: "C1 DCS with 7-bit ST is not parsed as one sequence", input: "\x90qpayload\x1b\\", expected: []string{"\x90", "q", "p", "a", "y", "l", "o", "a", "d", "\x1b", "\\"}}, {name: "C1 DCS canceled by CAN", input: "\x90qpayload\x18x", expected: 
[]string{"\x90qpayload", "\x18", "x"}},
 		{name: "C1 SOS with C1 ST", input: "\x98hello\x9C", expected: []string{"\x98hello\x9C"}},
-		{name: "C1 PM with 7-bit ST", input: "\x9Emsg\x1b\\", expected: []string{"\x9Emsg\x1b\\"}},
+		{name: "C1 PM with 7-bit ST is not parsed as one sequence", input: "\x9Emsg\x1b\\", expected: []string{"\x9E", "m", "s", "g", "\x1b", "\\"}},
 		{name: "C1 APC with C1 ST", input: "\x9Fdata\x9C", expected: []string{"\x9Fdata\x9C"}},
 		{name: "single C1 Fe control", input: "\x84", expected: []string{"\x84"}},
 		{name: "C1 OSC unterminated", input: "\x9D0;title", expected: []string{"\x9D", "0", ";", "t", "i", "t", "l", "e"}},
@@ -270,7 +270,7 @@ func TestAnsiEscapeSequencesBothEnabledAsGraphemes(t *testing.T) {
 		{
 			name:     "C1 OSC with 7-bit ST terminator",
 			input:    "\x9D0;Title\x1b\\",
-			expected: []string{"\x9D0;Title\x1b\\"},
+			expected: []string{"\x9D", "0", ";", "T", "i", "t", "l", "e", "\x1b\\"},
 		},
 		{
 			name:     "7-bit OSC with C1 ST terminator",
@@ -290,7 +290,7 @@ func TestAnsiEscapeSequencesBothEnabledAsGraphemes(t *testing.T) {
 		{
 			name:     "C1 DCS with 7-bit ST terminator",
 			input:    "\x90qpayload\x1b\\",
-			expected: []string{"\x90qpayload\x1b\\"},
+			expected: []string{"\x90", "q", "p", "a", "y", "l", "o", "a", "d", "\x1b\\"},
 		},
 		{
 			name:     "7-bit DCS with C1 ST terminator",
@@ -340,7 +340,7 @@ func TestAnsiEscapeSequencesBothEnabledAsGraphemes(t *testing.T) {
 		{
 			name:     "C1 PM with 7-bit ST terminator",
 			input:    "\x9Emsg\x1b\\",
-			expected: []string{"\x9Emsg\x1b\\"},
+			expected: []string{"\x9E", "m", "s", "g", "\x1b\\"},
 		},
 		{
 			name:     "C1 APC with C1 ST terminator",
diff --git a/graphemes/comparative/comparative_test.go b/graphemes/comparative/comparative_test.go
index f91de82..b28a4e2 100644
--- a/graphemes/comparative/comparative_test.go
+++ b/graphemes/comparative/comparative_test.go
@@ -111,11 +111,8 @@ func TestAnsiBoundaryAgreement(t *testing.T) {
 		// C1 8-bit
 		{"C1 CSI then text", "\x9B31mhello"},
 		{"C1 OSC with C1 ST", "\x9D0;Title\x9C"},
-		{"C1 OSC with 7-bit ST", "\x9D0;Title\x1b\\"},
-		{"C1 DCS with 7-bit ST", "\x90qpayload\x1b\\"},
 		{"C1 DCS with C1 ST", "\x90qpayload\x9C"},
 		{"C1 SOS with C1 ST", "\x98hello\x9C"},
-		{"C1 PM with 7-bit ST", "\x9Emsg\x1b\\"},
 		{"C1 APC with C1 ST", "\x9Fdata\x9C"},
 
 		// CSI variants (from charmbracelet test suite)

From baed4dba8cb6811b05e011b517ad97297185cfe8 Mon Sep 17 00:00:00 2001
From: Matt Sherman
Date: Sun, 15 Feb 2026 16:34:56 -0500
Subject: [PATCH 09/15] Add fuzz for ANSI

---
 graphemes/fuzz_test.go | 70 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 70 insertions(+)

diff --git a/graphemes/fuzz_test.go b/graphemes/fuzz_test.go
index d6b840a..9d39677 100644
--- a/graphemes/fuzz_test.go
+++ b/graphemes/fuzz_test.go
@@ -153,3 +153,73 @@ func FuzzInvalid(f *testing.F) {
 		}
 	})
 }
+
+// FuzzANSIOptions fuzzes iterator roundtripping: the tokens from graphemes.FromBytes, concatenated, must equal the input.
+// Each input is run in four modes: ANSI handling off, 7-bit only, 8-bit only, and both options enabled.
+func FuzzANSIOptions(f *testing.F) {
+	if testing.Short() {
+		f.Skip("skipping fuzz test in short mode")
+	}
+
+	seeds := [][]byte{
+		[]byte("\x1b[31mhello\x1b[0m"),         // 7-bit CSI
+		[]byte("\x1b]0;Title\x07"),             // 7-bit OSC + BEL
+		[]byte("\x1bPqpayload\x1b\\"),          // 7-bit DCS + 7-bit ST
+		[]byte("\x9B31mhello"),                 // C1 CSI
+		[]byte("\x9D0;Title\x9C"),              // C1 OSC + C1 ST
+		[]byte("\x90qpayload\x9C"),             // C1 DCS + C1 ST
+		[]byte("\x98hello\x9C"),                // C1 SOS + C1 ST
+		[]byte("\x9Emsg\x9C"),                  // C1 PM + C1 ST
+		[]byte("\x9Fdata\x9C"),                 // C1 APC + C1 ST
+		[]byte("\x1b]0;Title\x9C"),             // 7-bit initiator + C1 ST (strict negative)
+		[]byte("\x9D0;Title\x1b\\"),            // C1 initiator + 7-bit ST (strict negative)
+		[]byte("\x1b]0;本\x07"),                 // UTF-8 in OSC payload
+		[]byte("\x90q本\x9C"),                   // UTF-8 in C1 DCS payload
+		[]byte("\x1b[31m\x9B1;32mtext\x1b[0m"), // mixed 7-bit + 8-bit CSI
+		[]byte("\x1b"),                         // truncated ESC
+		[]byte("\x9D0;unterminated"),           // unterminated C1 OSC
+		[]byte("plain UTF-8: café 日本語 👩🏽‍💻"),   // non-ANSI UTF-8
+	}
+	for _, s := range seeds { // register every seed with the fuzz corpus
+		f.Add(s)
+	}
+
+	f.Fuzz(func(t *testing.T, original []byte) {
+		validOriginal := utf8.Valid(original) // computed once per input; compared against the roundtrip in every mode
+
+		modes := []struct { // all four on/off combinations of the two ANSI options
+			name     string
+			ansi7Bit bool
+			ansi8Bit bool
+		}{
+			{name: "off", ansi7Bit: false, ansi8Bit: false},
+			{name: "7bit", ansi7Bit: true, ansi8Bit: false},
+			{name: "8bit", ansi7Bit: false, ansi8Bit: true},
+			{name: "both", ansi7Bit: true, ansi8Bit: true},
+		}
+
+		for _, mode := range modes {
+			tokens := graphemes.FromBytes(original) // fresh iterator for each mode
+			tokens.AnsiEscapeSequences = mode.ansi7Bit
+			tokens.AnsiEscapeSequences8Bit = mode.ansi8Bit
+
+			var all [][]byte // every token, in iteration order
+			for tokens.Next() {
+				all = append(all, tokens.Value())
+			}
+
+			roundtrip := make([]byte, 0, len(original)) // concatenation of all tokens
+			for _, s := range all {
+				roundtrip = append(roundtrip, s...)
+			}
+
+			if !bytes.Equal(roundtrip, original) { // tokenizing must neither drop nor alter bytes
+				t.Fatalf("%s mode: bytes did not roundtrip", mode.name)
+			}
+
+			if validOriginal != utf8.Valid(roundtrip) { // NOTE(review): only reached when roundtrip equals original, so this cannot differ
+				t.Fatalf("%s mode: utf8 validity of original did not match roundtrip", mode.name)
+			}
+		}
+	})
+}

From e2754ac2988f8ed11d691fa3db79b1242d06a4ee Mon Sep 17 00:00:00 2001
From: Matt Sherman
Date: Sun, 15 Feb 2026 16:54:37 -0500
Subject: [PATCH 10/15] Add fuzz to Actions

---
 .github/workflows/gofuzz.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/gofuzz.yml b/.github/workflows/gofuzz.yml
index 0168a66..13f29dd 100644
--- a/.github/workflows/gofuzz.yml
+++ b/.github/workflows/gofuzz.yml
@@ -11,7 +11,7 @@ jobs:
     strategy:
       matrix:
         package: [words, sentences, graphemes, phrases]
-        fuzzer: [FuzzValidShort, FuzzValidLong, FuzzInvalid]
+        fuzzer: [FuzzValidShort, FuzzValidLong, FuzzInvalid, FuzzANSIOptions]
     runs-on: ubuntu-latest
     steps:
       - name: Set up Go

From 43a34cf68d4c586d4bb19a87d5a635b24e73de06 Mon Sep 17 00:00:00 2001
From: Matt Sherman
Date: Sun, 15 Feb 2026 22:17:24 -0500
Subject: [PATCH 11/15] a bit more compat

---
 graphemes/comparative/comparative_test.go | 67 +++++++++++++++++++++++
 1 file changed, 67 insertions(+)

diff --git
a/graphemes/comparative/comparative_test.go b/graphemes/comparative/comparative_test.go
index b28a4e2..2e3b8ef 100644
--- a/graphemes/comparative/comparative_test.go
+++ b/graphemes/comparative/comparative_test.go
@@ -159,6 +159,19 @@ func TestAnsiBoundaryAgreement(t *testing.T) {
 
 		// Plain text (no ANSI)
 		{"plain ASCII", "hello world"},
+
+		// DecodeSequence parser parity edge cases
+		{"single ESC byte", "\x1b"},
+		{"single NUL byte", "\x00"},
+		{"ASCII DEL byte", "\x7f"},
+		{"DEL between ASCII runes", "a\x7fb"},
+		{"double ESC", "\x1b\x1b"},
+		{"double ST 7-bit", "\x1b\\\x1b\\"},
+		{"double ST 8-bit", "\x9c\x9c"},
+		{"single-param OSC", "\x1b]112\x07"},
+		{"ESC with intermediate", "\x1b Q"},
+		{"DCS containing DEL payload", "\x1bP1;2+xa\x7fb\x1b\\"},
+		{"OSC with C1 bytes in payload", "\x1b]11;\x90?\x1b\\"},
 	}
 
 	for _, tt := range tests {
@@ -172,6 +185,60 @@ func TestAnsiBoundaryAgreement(t *testing.T) {
 	}
 }
 
+// TestAnsiBoundaryKnownDivergences documents inputs where our grapheme-oriented tokenizer intentionally differs from
+// charmbracelet/x/ansi DecodeSequence. A case fails if both tokenizations match; otherwise both are logged with the reason.
+func TestAnsiBoundaryKnownDivergences(t *testing.T) {
+	tests := []struct {
+		name   string
+		input  string
+		reason string // why the two tokenizers are expected to disagree; reported in the failure message and logs
+	}{
+		{
+			name:   "unterminated CSI",
+			input:  "\x1b[1;2;3",
+			reason: "DecodeSequence returns one unterminated CSI token; we split when no final byte is present",
+		},
+		{
+			name:   "unterminated OSC",
+			input:  "\x1b]11;ff/00/ff",
+			reason: "DecodeSequence returns one unterminated OSC token; we split when OSC has no BEL/ST/CAN/SUB terminator",
+		},
+		{
+			name:   "unterminated OSC followed by CSI",
+			input:  "\x1b]11;ff/00/ff\x1b[1;2;3m",
+			reason: "DecodeSequence ends OSC at ESC and parses following CSI; we require explicit OSC terminator",
+		},
+		{
+			name:   "unterminated OSC followed by bare ESC",
+			input:  "\x1b]11;ff/00/ff\x1b",
+			reason: "DecodeSequence emits unterminated OSC then ESC; we split because OSC is invalid without terminator",
+		},
+		{
+			name:   "unterminated DCS",
+			input:  "\x1bP1;2+xa",
+			reason: "DecodeSequence returns one unterminated DCS token; we split when DCS has no ST/CAN/SUB terminator",
+		},
+		{
+			name:   "invalid DCS immediately terminated",
+			input:  "\x1bP\x1b\\ab",
+			reason: "DecodeSequence emits ESC P token before ST; we do not treat invalid DCS start as a sequence",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			ours := uax29Tokens(tt.input)   // helper defined outside this hunk; presumably this package's token boundaries
+			theirs := charmTokens(tt.input) // helper defined outside this hunk; presumably DecodeSequence's token boundaries
+			if reflect.DeepEqual(ours, theirs) { // identical boundaries mean the case no longer diverges
+				t.Fatalf("expected divergence, but boundaries matched\nreason: %s\ntokens: %q", tt.reason, ours)
+			}
+			t.Logf("reason: %s", tt.reason)
+			t.Logf("ours: %q", ours)
+			t.Logf("theirs: %q", theirs)
+		})
+	}
+}
+
 // ansiSample builds a realistic ANSI-heavy string simulating colored terminal output.
func ansiSample() string { var b strings.Builder From 0072a1b0f46f5e4564db1d360d564267f4d3c101 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Sun, 15 Feb 2026 22:59:22 -0500 Subject: [PATCH 12/15] tweak --- graphemes/ansi8.go | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/graphemes/ansi8.go b/graphemes/ansi8.go index a5fde3b..ec66446 100644 --- a/graphemes/ansi8.go +++ b/graphemes/ansi8.go @@ -58,10 +58,11 @@ func oscLengthC1[T ~string | ~[]byte](data T) int { // (0x9C), or are canceled by CAN/SUB. func stSequenceLengthC1[T ~string | ~[]byte](data T) int { for i := 0; i < len(data); i++ { - if data[i] == can || data[i] == sub { + b := data[i] + if b == can || b == sub { return i } - if data[i] == st { + if b == st { return i + 1 } } From 902cb81f35ceef5005db822b4e354999f9b729ec Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Sun, 15 Feb 2026 23:06:11 -0500 Subject: [PATCH 13/15] comment --- graphemes/ansi8.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/graphemes/ansi8.go b/graphemes/ansi8.go index ec66446..d9b0c48 100644 --- a/graphemes/ansi8.go +++ b/graphemes/ansi8.go @@ -41,6 +41,15 @@ func ansiEscapeLength8Bit[T ~string | ~[]byte](data T) int { return 0 } +// oscLengthC1 returns the length of a C1 OSC body. +// data is the slice after the C1 OSC initiator (0x9D). +// +// Returns: +// - n >= 0: consumed body length (includes BEL/ST terminator when present) +// - -1: not terminated in the provided data +// +// Terminators: BEL (0x07) or C1 ST (0x9C). +// CAN (0x18) and SUB (0x1A) cancel the control string. 
func oscLengthC1[T ~string | ~[]byte](data T) int { for i := 0; i < len(data); i++ { b := data[i] From 81b723c283133749c02e3b6a7a4d9b23140dceb6 Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Sun, 15 Feb 2026 23:06:26 -0500 Subject: [PATCH 14/15] more efficient CI fuzz --- .github/workflows/gofuzz.yml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/.github/workflows/gofuzz.yml b/.github/workflows/gofuzz.yml index 13f29dd..ca6306f 100644 --- a/.github/workflows/gofuzz.yml +++ b/.github/workflows/gofuzz.yml @@ -11,7 +11,10 @@ jobs: strategy: matrix: package: [words, sentences, graphemes, phrases] - fuzzer: [FuzzValidShort, FuzzValidLong, FuzzInvalid, FuzzANSIOptions] + fuzzer: [FuzzValidShort, FuzzValidLong, FuzzInvalid] + include: + - package: graphemes + fuzzer: FuzzANSIOptions runs-on: ubuntu-latest steps: - name: Set up Go From 1adff8283a122837c7a533313b24125e8c9e832e Mon Sep 17 00:00:00 2001 From: Matt Sherman Date: Mon, 16 Feb 2026 09:51:07 -0500 Subject: [PATCH 15/15] comments --- graphemes/iterator.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/graphemes/iterator.go b/graphemes/iterator.go index a3f9aac..90d669a 100644 --- a/graphemes/iterator.go +++ b/graphemes/iterator.go @@ -27,12 +27,12 @@ type Iterator[T ~string | ~[]byte] struct { data T pos int start int - // AnsiEscapeSequences treats 7-bit ANSI escape sequences (ECMA-48) as + // AnsiEscapeSequences treats 7-bit C0 ANSI escape sequences (ECMA-48) as // single grapheme clusters when true. The default is false. // // 8-bit controls are not enabled by this option. See [AnsiEscapeSequences8Bit]. AnsiEscapeSequences bool - // AnsiEscapeSequences8Bit treats 8-bit C1 control codes (ECMA-48) as single + // AnsiEscapeSequences8Bit treats 8-bit C1 ANSI escape sequences (ECMA-48) as single // grapheme clusters when true. The default is false. // // 8-bit control bytes are not UTF-8 encoded, i.e. not valid UTF-8. 
If you @@ -52,7 +52,7 @@ const ( bel = 0x07 can = 0x18 sub = 0x1A - st = 0x9C // C1 String Terminator + st = 0x9C ) // Next advances the iterator to the next grapheme cluster.