diff --git a/.github/workflows/gofuzz.yml b/.github/workflows/gofuzz.yml index 0168a66..ca6306f 100644 --- a/.github/workflows/gofuzz.yml +++ b/.github/workflows/gofuzz.yml @@ -12,6 +12,9 @@ jobs: matrix: package: [words, sentences, graphemes, phrases] fuzzer: [FuzzValidShort, FuzzValidLong, FuzzInvalid] + include: + - package: graphemes + fuzzer: FuzzANSIOptions runs-on: ubuntu-latest steps: - name: Set up Go diff --git a/graphemes/README.md b/graphemes/README.md index ca146e0..3f8a5e3 100644 --- a/graphemes/README.md +++ b/graphemes/README.md @@ -74,7 +74,7 @@ for g.Next() { // Next() returns true until end of data ### ANSI escape sequences -By the UAX 29 specification, ANSI escape sequences are not grapheme clusters. To treat these sequences as a single cluster, set the `AnsiEscapeSequences` option to true. +By the UAX 29 specification, ANSI escape sequences are not grapheme clusters. To treat 7-bit ANSI escape sequences as a single cluster, set `AnsiEscapeSequences` to true. ```go text := "Hello, \x1b[31mworld\x1b[0m!" @@ -86,7 +86,17 @@ for g.Next() { } ``` -We implement [ECMA-48](https://ecma-international.org/publications-and-standards/standards/ecma-48/) C0 and C1 control codes, 7-bit and 8-bit, in UTF-8 encoding. +To also parse 8-bit C1 controls (non-UTF-8 bytes), set `AnsiEscapeSequences8Bit` to true. + +```go +g.AnsiEscapeSequences = true // 7-bit forms (ESC ...) +g.AnsiEscapeSequences8Bit = true // 8-bit C1 forms (0x80-0x9F), not valid UTF-8 +``` + +For ESC-initiated (7-bit) control strings, only 7-bit terminators are recognized. +For C1-initiated (8-bit) control strings, only C1 ST (`0x9C`) is recognized as ST. + +We implement [ECMA-48](https://ecma-international.org/publications-and-standards/standards/ecma-48/) control codes in both 7-bit and 8-bit representations. 8-bit control codes are not UTF-8 encoded and are not valid UTF-8, caveat emptor. ### Benchmarks diff --git a/graphemes/ansi.go b/graphemes/ansi.go index 3a038db..9cd09b4 100644 --- a/graphemes/ansi.go +++ b/graphemes/ansi.go @@ -1,108 +1,72 @@ package graphemes -// ansiEscapeLength returns the byte length of a valid ANSI escape/control +// ansiEscapeLength returns the byte length of a valid 7-bit ANSI escape // sequence at the start of data, or 0 if none. // -// Input is UTF-8. This recognizes both: -// - 7-bit representations (ESC + final/intermediate bytes), and -// - UTF-8 encodings of 8-bit C1 controls (U+0080..U+009F => 0xC2 0x80..0x9F). -// // Recognized forms (ECMA-48 / ISO 6429): -// - CSI: ESC [ then parameter bytes (0x30–0x3F), intermediate (0x20–0x2F), final (0x40–0x7E) -// - OSC: ESC ] then payload until ST (ESC \) or BEL (0x07) -// - DCS, SOS, PM, APC: ESC P / X / ^ / _ then payload until ST (ESC \) -// - Two-byte: ESC + Fe/Fs (0x40–0x7E excluding above), or Fp (0x30–0x3F), or nF (0x20–0x2F then final) +// - CSI: ESC [ then parameter bytes (0x30-0x3F), intermediate (0x20-0x2F), final (0x40-0x7E) +// - OSC: ESC ] then payload until BEL (0x07), 7-bit ST (ESC \), CAN (0x18), or SUB (0x1A) +// - DCS, SOS, PM, APC: ESC P/X/^/_ then payload until 7-bit ST (ESC \), CAN, or SUB +// - Two-byte: ESC + Fe/Fs (0x40-0x7E excluding above), or Fp (0x30-0x3F), or nF (0x20-0x2F then final) func ansiEscapeLength[T ~string | ~[]byte](data T) int { n := len(data) - if n < 2 { + if n < 2 || data[0] != esc { return 0 } - switch data[0] { - case esc: - b1 := data[1] - switch b1 { - case '[': // CSI - body := csiLength(data[2:]) - if body == 0 { - return 0 - } - return 2 + body - case ']': // OSC – allows BEL or ST as terminator - body := oscLength(data[2:]) - if body < 0 { - return 0 - } - return 2 + body - case 'P', 'X', '^', '_': // DCS, SOS, PM, APC – require ST only - body := stSequenceLength(data[2:]) - if body < 0 { - return 0 - } - return 2 + body - } - if b1 >= 0x40 && b1 <= 0x7E { - // Fe/Fs two-byte; [ ] P X ^ _ handled above - return 2 - } - if b1 >= 0x30 && b1 <= 0x3F { - // Fp (private) two-byte - return 2 + b1 := data[1] + switch b1 { + case '[': // CSI + body := csiBodyLength(data[2:]) + if body == 0 { + return 0 } - if b1 >= 0x20 && b1 <= 0x2F { - // nF: intermediates then one final (0x30–0x7E) - i := 2 - for i < n && data[i] >= 0x20 && data[i] <= 0x2F { - i++ - } - if i < n && data[i] >= 0x30 && data[i] <= 0x7E { - return i + 1 - } + return 2 + body + case ']': // OSC - allows BEL or 7-bit ST terminator + body := oscLength(data[2:]) + if body < 0 { return 0 } - - case c1UTF8Lead: - b1 := data[1] - if b1 < 0x80 || b1 > 0x9F { + return 2 + body + case 'P', 'X', '^', '_': // DCS, SOS, PM, APC + body := stSequenceLength(data[2:]) + if body < 0 { return 0 } + return 2 + body + } - switch b1 { - case 0x9B: // CSI - body := csiLength(data[2:]) - if body == 0 { - return 0 - } - return 2 + body - case 0x9D: // OSC – allows BEL or ST as terminator - body := oscLength(data[2:]) - if body < 0 { - return 0 - } - return 2 + body - case 0x90, 0x98, 0x9E, 0x9F: // DCS, SOS, PM, APC – require ST only - body := stSequenceLength(data[2:]) - if body < 0 { - return 0 - } - return 2 + body - default: - // Any other C1 control (UTF-8 encoded) is one control sequence token. - return 2 + if b1 >= 0x40 && b1 <= 0x7E { + // Fe/Fs two-byte; [ ] P X ^ _ handled above + return 2 + } + if b1 >= 0x30 && b1 <= 0x3F { + // Fp (private) two-byte + return 2 + } + if b1 >= 0x20 && b1 <= 0x2F { + // nF: intermediates then one final (0x30-0x7E) + i := 2 + for i < n && data[i] >= 0x20 && data[i] <= 0x2F { + i++ } + if i < n && data[i] >= 0x30 && data[i] <= 0x7E { + return i + 1 + } + return 0 } return 0 } -// csiLength returns the length of the CSI body (param/intermediate/final bytes). +// csiBodyLength returns the length of the CSI body (param/intermediate/final bytes). // data is the slice after "ESC [". // Per ECMA-48, the CSI body has the form: // // parameters (0x30–0x3F)*, intermediates (0x20–0x2F)*, final (0x40–0x7E) // // Once an intermediate byte is seen, subsequent parameter bytes are invalid. -func csiLength[T ~string | ~[]byte](data T) int { +func csiBodyLength[T ~string | ~[]byte](data T) int { seenIntermediate := false for i := 0; i < len(data); i++ { b := data[i] @@ -125,13 +89,13 @@ func csiLength[T ~string | ~[]byte](data T) int { } // oscLength returns the length of the OSC body. -// data is the slice after "ESC ]" (or C1 OSC). +// data is the slice after "ESC ]". // // Returns: // - n >= 0: consumed body length (includes BEL/ST terminator when present) // - -1: not terminated in the provided data // -// OSC accepts BEL (0x07) or ST as terminator by widespread convention. +// OSC accepts BEL (0x07) or 7-bit ST (ESC \) as terminators by widespread convention. // Per ECMA-48, CAN (0x18) and SUB (0x1A) cancel the control string; in that // case they are not part of the OSC sequence length. func oscLength[T ~string | ~[]byte](data T) int { @@ -146,21 +110,19 @@ func oscLength[T ~string | ~[]byte](data T) int { if b == esc && i+1 < len(data) && data[i+1] == '\\' { return i + 2 } - if b == c1UTF8Lead && i+1 < len(data) && data[i+1] == 0x9C { - return i + 2 - } } return -1 } // stSequenceLength returns the length of a control-string body. -// data is the slice after "ESC x" (or C1 DCS/SOS/PM/APC). +// data is the slice after "ESC x". // // Returns: // - n >= 0: consumed body length (includes ST terminator when present) // - -1: not terminated in the provided data // // Used for DCS, SOS, PM, and APC, which per ECMA-48 terminate with ST. +// ST here is the 7-bit form (ESC \). // CAN (0x18) and SUB (0x1A) cancel the control string; in that case they are // not part of the sequence length. func stSequenceLength[T ~string | ~[]byte](data T) int { @@ -171,9 +133,6 @@ func stSequenceLength[T ~string | ~[]byte](data T) int { if data[i] == esc && i+1 < len(data) && data[i+1] == '\\' { return i + 2 } - if data[i] == c1UTF8Lead && i+1 < len(data) && data[i+1] == 0x9C { - return i + 2 - } } return -1 } diff --git a/graphemes/ansi8.go b/graphemes/ansi8.go new file mode 100644 index 0000000..d9b0c48 --- /dev/null +++ b/graphemes/ansi8.go @@ -0,0 +1,79 @@ +package graphemes + +// ansiEscapeLength8Bit returns the byte length of a valid 8-bit C1 ANSI +// sequence at the start of data, or 0 if none. +// +// Recognized forms (ECMA-48 / ISO 6429): +// - C1 CSI (0x9B) body as parameter/intermediate/final bytes +// - C1 OSC (0x9D) body terminated by BEL, C1 ST, CAN, or SUB +// - C1 DCS/SOS/PM/APC (0x90/0x98/0x9E/0x9F) body terminated by C1 ST, CAN, or SUB +// - Standalone C1 controls (0x80..0x9F not listed above): single byte +func ansiEscapeLength8Bit[T ~string | ~[]byte](data T) int { + if len(data) == 0 { + return 0 + } + + switch data[0] { + case 0x9B: // C1 CSI + body := csiBodyLength(data[1:]) + if body == 0 { + return 0 + } + return 1 + body + case 0x9D: // C1 OSC + body := oscLengthC1(data[1:]) + if body < 0 { + return 0 + } + return 1 + body + case 0x90, 0x98, 0x9E, 0x9F: // C1 DCS, SOS, PM, APC + body := stSequenceLengthC1(data[1:]) + if body < 0 { + return 0 + } + return 1 + body + default: + if data[0] >= 0x80 && data[0] <= 0x9F { + return 1 + } + } + + return 0 +} + +// oscLengthC1 returns the length of a C1 OSC body. +// data is the slice after the C1 OSC initiator (0x9D). +// +// Returns: +// - n >= 0: consumed body length (includes BEL/ST terminator when present) +// - -1: not terminated in the provided data +// +// Terminators: BEL (0x07) or C1 ST (0x9C). +// CAN (0x18) and SUB (0x1A) cancel the control string. +func oscLengthC1[T ~string | ~[]byte](data T) int { + for i := 0; i < len(data); i++ { + b := data[i] + if b == bel || b == st { + return i + 1 + } + if b == can || b == sub { + return i + } + } + return -1 +} + +// stSequenceLengthC1 parses DCS/SOS/PM/APC bodies that terminate with C1 ST +// (0x9C), or are canceled by CAN/SUB. +func stSequenceLengthC1[T ~string | ~[]byte](data T) int { + for i := 0; i < len(data); i++ { + b := data[i] + if b == can || b == sub { + return i + } + if b == st { + return i + 1 + } + } + return -1 +} diff --git a/graphemes/ansi_test.go b/graphemes/ansi_test.go index 32b37f1..74137ea 100644 --- a/graphemes/ansi_test.go +++ b/graphemes/ansi_test.go @@ -9,14 +9,104 @@ import ( "github.com/clipperhouse/uax29/v2/testdata" ) -func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { +type ansiCase struct { + name string + input string + expected []string +} + +func assertANSITokens(t *testing.T, input string, expected []string, sevenBit, eightBit bool) { + t.Helper() + + assertEqual := func(kind string, got []string) { + t.Helper() + if !reflect.DeepEqual(got, expected) { + t.Errorf("%s mismatch\ngot %q\nexpected %q", kind, got, expected) + } + } + + iterString := graphemes.FromString(input) + iterString.AnsiEscapeSequences = sevenBit + iterString.AnsiEscapeSequences8Bit = eightBit + var gotString []string + for iterString.Next() { + gotString = append(gotString, iterString.Value()) + } + assertEqual("string", gotString) + + iterBytes := graphemes.FromBytes([]byte(input)) + iterBytes.AnsiEscapeSequences = sevenBit + iterBytes.AnsiEscapeSequences8Bit = eightBit + var gotBytes []string + for iterBytes.Next() { + gotBytes = append(gotBytes, string(iterBytes.Value())) + } + assertEqual("bytes", gotBytes) +} + +func runANSICases(t *testing.T, tests []ansiCase, sevenBit, eightBit bool) { + t.Helper() + for _, tt := range tests { + tt := tt + t.Run(tt.name, func(t *testing.T) { + t.Parallel() + assertANSITokens(t, tt.input, tt.expected, sevenBit, eightBit) + }) + } +} + +func TestAnsiEscapeSequences7BitOnlyAsGraphemes(t *testing.T) { t.Parallel() - tests := []struct { - name string - input string - expected []string - }{ + tests := []ansiCase{ + {name: "SGR reset", input: "\x1b[0m", expected: []string{"\x1b[0m"}}, + {name: "SGR red then text", input: "\x1b[31mhello", expected: []string{"\x1b[31m", "h", "e", "l", "l", "o"}}, + {name: "CSI with valid intermediate", input: "\x1b[0 q", expected: []string{"\x1b[0 q"}}, + {name: "OSC window title then BEL", input: "\x1b]0;My Title\x07", expected: []string{"\x1b]0;My Title\x07"}}, + {name: "OSC window title then ST", input: "\x1b]0;Title\x1b\\", expected: []string{"\x1b]0;Title\x1b\\"}}, + {name: "DCS with ST terminator", input: "\x1bPq#0;2;0;0;0\x1b\\", expected: []string{"\x1bPq#0;2;0;0;0\x1b\\"}}, + {name: "DCS canceled by CAN", input: "\x1bPqdata\x18z", expected: []string{"\x1bPqdata", "\x18", "z"}}, + {name: "SOS with ST terminator", input: "\x1bXhello\x1b\\", expected: []string{"\x1bXhello\x1b\\"}}, + {name: "PM with ST terminator", input: "\x1b^msg\x1b\\", expected: []string{"\x1b^msg\x1b\\"}}, + {name: "APC with ST terminator", input: "\x1b_data\x1b\\", expected: []string{"\x1b_data\x1b\\"}}, + {name: "two-byte Fe", input: "\x1bD", expected: []string{"\x1bD"}}, + {name: "two-byte Fp", input: "\x1b7", expected: []string{"\x1b7"}}, + {name: "nF with multiple intermediates", input: "\x1b !Fx", expected: []string{"\x1b !F", "x"}}, + {name: "malformed CSI remains split", input: "\x1b[ 1mok", expected: []string{"\x1b", "[", " ", "1", "m", "o", "k"}}, + {name: "C1 CSI is not parsed", input: "\x9B31mhello", expected: []string{"\x9B", "3", "1", "m", "h", "e", "l", "l", "o"}}, + {name: "7-bit OSC does not accept C1 ST", input: "\x1b]0;Title\x9Cz", expected: []string{"\x1b", "]", "0", ";", "T", "i", "t", "l", "e", "\x9C", "z"}}, + } + + runANSICases(t, tests, true, false) +} + +func TestAnsiEscapeSequences8BitOnlyAsGraphemes(t *testing.T) { + t.Parallel() + + tests := []ansiCase{ + {name: "C1 CSI then text", input: "\x9B31mhello", expected: []string{"\x9B31m", "h", "e", "l", "l", "o"}}, + {name: "C1 CSI multiple params", input: "\x9B1;2;3m", expected: []string{"\x9B1;2;3m"}}, + {name: "C1 OSC with C1 ST", input: "\x9D0;Title\x9C", expected: []string{"\x9D0;Title\x9C"}}, + {name: "C1 OSC with 7-bit ST is not parsed as one sequence", input: "\x9D0;Title\x1b\\", expected: []string{"\x9D", "0", ";", "T", "i", "t", "l", "e", "\x1b", "\\"}}, + {name: "C1 DCS with C1 ST", input: "\x90qpayload\x9C", expected: []string{"\x90qpayload\x9C"}}, + {name: "C1 DCS with 7-bit ST is not parsed as one sequence", input: "\x90qpayload\x1b\\", expected: []string{"\x90", "q", "p", "a", "y", "l", "o", "a", "d", "\x1b", "\\"}}, + {name: "C1 DCS canceled by CAN", input: "\x90qpayload\x18x", expected: []string{"\x90qpayload", "\x18", "x"}}, + {name: "C1 SOS with C1 ST", input: "\x98hello\x9C", expected: []string{"\x98hello\x9C"}}, + {name: "C1 PM with 7-bit ST is not parsed as one sequence", input: "\x9Emsg\x1b\\", expected: []string{"\x9E", "m", "s", "g", "\x1b", "\\"}}, + {name: "C1 APC with C1 ST", input: "\x9Fdata\x9C", expected: []string{"\x9Fdata\x9C"}}, + {name: "single C1 Fe control", input: "\x84", expected: []string{"\x84"}}, + {name: "C1 OSC unterminated", input: "\x9D0;title", expected: []string{"\x9D", "0", ";", "t", "i", "t", "l", "e"}}, + {name: "C1 DCS unterminated", input: "\x90data", expected: []string{"\x90", "d", "a", "t", "a"}}, + {name: "7-bit ESC sequence is not parsed", input: "\x1b[31mhello", expected: []string{"\x1b", "[", "3", "1", "m", "h", "e", "l", "l", "o"}}, + } + + runANSICases(t, tests, false, true) +} + +func TestAnsiEscapeSequencesBothEnabledAsGraphemes(t *testing.T) { + t.Parallel() + + tests := []ansiCase{ { name: "SGR reset", input: "\x1b[0m", @@ -37,6 +127,11 @@ func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { input: "\x1b]0;My Title\x07", expected: []string{"\x1b]0;My Title\x07"}, }, + { + name: "OSC UTF-8 payload does not terminate at continuation byte", + input: "\x1b]0;本\x07", + expected: []string{"\x1b]0;本\x07"}, + }, { name: "OSC window title then ST", input: "\x1b]0;Title\x1b\\", @@ -47,6 +142,11 @@ func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { input: "\x1bPq#0;2;0;0;0\x1b\\", expected: []string{"\x1bPq#0;2;0;0;0\x1b\\"}, }, + { + name: "DCS UTF-8 payload does not terminate at continuation byte", + input: "\x1bPq本\x1b\\", + expected: []string{"\x1bPq本\x1b\\"}, + }, { name: "DCS with BEL in payload is not a single sequence", input: "\x1bPq\x07rest", @@ -158,54 +258,49 @@ func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { expected: []string{"\x1b[m", "x"}, }, { - name: "UTF-8 C1 CSI then text", - input: "\xC2\x9B31mhello", - expected: []string{"\xC2\x9B31m", "h", "e", "l", "l", "o"}, - }, - { - name: "UTF-8 C1 OSC with UTF-8 C1 ST terminator", - input: "\xC2\x9D0;Title\xC2\x9C", - expected: []string{"\xC2\x9D0;Title\xC2\x9C"}, + name: "C1 CSI then text", + input: "\x9B31mhello", + expected: []string{"\x9B31m", "h", "e", "l", "l", "o"}, }, { - name: "UTF-8 C1 OSC with 7-bit ST terminator", - input: "\xC2\x9D0;Title\x1b\\", - expected: []string{"\xC2\x9D0;Title\x1b\\"}, + name: "C1 OSC with C1 ST terminator", + input: "\x9D0;Title\x9C", + expected: []string{"\x9D0;Title\x9C"}, }, { - name: "7-bit OSC with UTF-8 C1 ST terminator", - input: "\x1b]0;Title\xC2\x9C", - expected: []string{"\x1b]0;Title\xC2\x9C"}, + name: "C1 OSC with 7-bit ST terminator", + input: "\x9D0;Title\x1b\\", + expected: []string{"\x9D", "0", ";", "T", "i", "t", "l", "e", "\x1b\\"}, }, { - name: "UTF-8 C1 DCS with UTF-8 C1 ST terminator", - input: "\xC2\x90qpayload\xC2\x9C", - expected: []string{"\xC2\x90qpayload\xC2\x9C"}, + name: "7-bit OSC with C1 ST terminator", + input: "\x1b]0;Title\x9C", + expected: []string{"\x1b", "]", "0", ";", "T", "i", "t", "l", "e", "\x9C"}, }, { - name: "UTF-8 C1 DCS canceled by CAN", - input: "\xC2\x90qpayload\x18x", - expected: []string{"\xC2\x90qpayload", "\x18", "x"}, + name: "C1 DCS with C1 ST terminator", + input: "\x90qpayload\x9C", + expected: []string{"\x90qpayload\x9C"}, }, { - name: "UTF-8 C1 DCS with 7-bit ST terminator", - input: "\xC2\x90qpayload\x1b\\", - expected: []string{"\xC2\x90qpayload\x1b\\"}, + name: "C1 DCS canceled by CAN", + input: "\x90qpayload\x18x", + expected: []string{"\x90qpayload", "\x18", "x"}, }, { - name: "7-bit DCS with UTF-8 C1 ST terminator", - input: "\x1bPqpayload\xC2\x9C", - expected: []string{"\x1bPqpayload\xC2\x9C"}, + name: "C1 DCS with 7-bit ST terminator", + input: "\x90qpayload\x1b\\", + expected: []string{"\x90", "q", "p", "a", "y", "l", "o", "a", "d", "\x1b\\"}, }, { - name: "UTF-8 C1 Fe IND control", - input: "\xC2\x84", - expected: []string{"\xC2\x84"}, + name: "7-bit DCS with C1 ST terminator", + input: "\x1bPqpayload\x9C", + expected: []string{"\x1b", "P", "q", "p", "a", "y", "l", "o", "a", "d", "\x9C"}, }, { - name: "UTF-8 C1 lead byte for non-C1 codepoint is not ANSI", - input: "\u00A9", - expected: []string{"\u00A9"}, + name: "C1 Fe IND control", + input: "\x84", + expected: []string{"\x84"}, }, { name: "nF malformed: no final byte", @@ -228,29 +323,29 @@ func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { expected: []string{"\x1b[0 q"}, }, { - name: "UTF-8 C1 OSC unterminated", - input: "\xC2\x9D0;title", - expected: []string{"\xC2\x9D", "0", ";", "t", "i", "t", "l", "e"}, + name: "C1 OSC unterminated", + input: "\x9D0;title", + expected: []string{"\x9D", "0", ";", "t", "i", "t", "l", "e"}, }, { - name: "UTF-8 C1 DCS unterminated", - input: "\xC2\x90data", - expected: []string{"\xC2\x90", "d", "a", "t", "a"}, + name: "C1 DCS unterminated", + input: "\x90data", + expected: []string{"\x90", "d", "a", "t", "a"}, }, { - name: "UTF-8 C1 SOS with UTF-8 C1 ST terminator", - input: "\xC2\x98hello\xC2\x9C", - expected: []string{"\xC2\x98hello\xC2\x9C"}, + name: "C1 SOS with C1 ST terminator", + input: "\x98hello\x9C", + expected: []string{"\x98hello\x9C"}, }, { - name: "UTF-8 C1 PM with 7-bit ST terminator", - input: "\xC2\x9Emsg\x1b\\", - expected: []string{"\xC2\x9Emsg\x1b\\"}, + name: "C1 PM with 7-bit ST terminator", + input: "\x9Emsg\x1b\\", + expected: []string{"\x9E", "m", "s", "g", "\x1b\\"}, }, { - name: "UTF-8 C1 APC with UTF-8 C1 ST terminator", - input: "\xC2\x9Fdata\xC2\x9C", - expected: []string{"\xC2\x9Fdata\xC2\x9C"}, + name: "C1 APC with C1 ST terminator", + input: "\x9Fdata\x9C", + expected: []string{"\x9Fdata\x9C"}, }, { name: "single ESC byte", @@ -258,9 +353,29 @@ func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { expected: []string{"\x1b"}, }, { - name: "single C1 lead byte (incomplete UTF-8)", - input: "\xC2", - expected: []string{"\xC2"}, + name: "single C1 control byte", + input: "\x84", + expected: []string{"\x84"}, + }, + { + name: "UTF-8 cafe", + input: "café", + expected: []string{"c", "a", "f", "é"}, + }, + { + name: "UTF-8 Japanese text", + input: "日本語", + expected: []string{"日", "本", "語"}, + }, + { + name: "UTF-8 runes with continuation bytes in C1 range", + input: "Āğל", + expected: []string{"Ā", "ğ", "ל"}, + }, + { + name: "mixed ANSI and UTF-8 adversarial payload", + input: "\x1b[31mĀğ日本語café\x1b[0m", + expected: []string{"\x1b[31m", "Ā", "ğ", "日", "本", "語", "c", "a", "f", "é", "\x1b[0m"}, }, { name: "SOS canceled by CAN", @@ -279,32 +394,69 @@ func TestAnsiEscapeSequencesAsGraphemes(t *testing.T) { }, } - for _, tt := range tests { - tt := tt - t.Run(tt.name, func(t *testing.T) { + runANSICases(t, tests, true, true) +} + +func TestAnsiEscapeSequencesPureUTF8Parity(t *testing.T) { + t.Parallel() + + samples := []string{ + "café", + "日本語", + "Āğל", + "A\u0301", + "👩🏽‍💻", + "Résumé — 東京 — 👍", + } + + collectString := func(input string, ansi7, ansi8 bool) []string { + iter := graphemes.FromString(input) + iter.AnsiEscapeSequences = ansi7 + iter.AnsiEscapeSequences8Bit = ansi8 + var out []string + for iter.Next() { + out = append(out, iter.Value()) + } + return out + } + + collectBytes := func(input string, ansi7, ansi8 bool) []string { + iter := graphemes.FromBytes([]byte(input)) + iter.AnsiEscapeSequences = ansi7 + iter.AnsiEscapeSequences8Bit = ansi8 + var out []string + for iter.Next() { + out = append(out, string(iter.Value())) + } + return out + } + + for i, sample := range samples { + sample := sample + t.Run("sample-"+string(rune('A'+i)), func(t *testing.T) { t.Parallel() - assertEqual := func(kind string, got []string) { - t.Helper() - if !reflect.DeepEqual(got, tt.expected) { - t.Errorf("%s mismatch\ngot %q\nexpected %q", kind, got, tt.expected) - } - } - iterString := graphemes.FromString(tt.input) - iterString.AnsiEscapeSequences = true - var gotString []string - for iterString.Next() { - gotString = append(gotString, iterString.Value()) - } - assertEqual("string", gotString) + stringBase := collectString(sample, false, false) + for _, flags := range []struct { + name string + ansi7 bool + ansi8 bool + }{ + {name: "7-bit only", ansi7: true, ansi8: false}, + {name: "8-bit only", ansi7: false, ansi8: true}, + {name: "both", ansi7: true, ansi8: true}, + } { + gotString := collectString(sample, flags.ansi7, flags.ansi8) + if !reflect.DeepEqual(stringBase, gotString) { + t.Fatalf("string parity mismatch for %q (%s)\noff=%q\non=%q", sample, flags.name, stringBase, gotString) + } - iterBytes := graphemes.FromBytes([]byte(tt.input)) - iterBytes.AnsiEscapeSequences = true - var gotBytes []string - for iterBytes.Next() { - gotBytes = append(gotBytes, string(iterBytes.Value())) + bytesBase := collectBytes(sample, false, false) + gotBytes := collectBytes(sample, flags.ansi7, flags.ansi8) + if !reflect.DeepEqual(bytesBase, gotBytes) { + t.Fatalf("bytes parity mismatch for %q (%s)\noff=%q\non=%q", sample, flags.name, bytesBase, gotBytes) + } } - assertEqual("bytes", gotBytes) }) } } @@ -356,20 +508,73 @@ func ansiSample() string { return b.String() } +// ansiSample8Bit builds a string that uses 8-bit C1 initiators. +func ansiSample8Bit() string { + var b strings.Builder + + lines := []string{ + "drwxr-xr-x 5 user staff 160 Jan 1 12:00 Documents", + "drwxr-xr-x 3 user staff 96 Feb 2 09:30 Downloads", + "-rwxr-xr-x 1 user staff 8432 Mar 15 14:22 build.sh", + "lrwxr-xr-x 1 user staff 11 Apr 20 08:00 config -> /etc/config", + "-rw-r--r-- 1 user staff 1024 May 5 16:45 README.md", + "total 42", + "drwxr-xr-x 2 user staff 64 Jun 10 11:11 src", + "-rw-r--r-- 1 user staff 512 Jul 7 07:07 main.go", + "error: file not found: missing.txt", + "warning: deprecated function used in line 42", + } + + for round := 0; round < 20; round++ { + for i, line := range lines { + // C1 OSC: 0x9D ... BEL + if i%5 == 0 { + b.WriteByte(0x9D) + b.WriteString("0;terminal - round ") + b.WriteString(string(rune('0' + round%10))) + b.WriteByte(0x07) + } + // C1 CSI SGR: 0x9B ... m + b.WriteByte(0x9B) + b.WriteString("1;3") + b.WriteString(string(rune('0' + (i % 8)))) + b.WriteByte('m') + b.WriteString(line) + // C1 CSI reset: 0x9B0m + b.WriteByte(0x9B) + b.WriteString("0m") + b.WriteByte('\n') + } + } + return b.String() +} + +// ansiSampleMixed builds a string with both 7-bit and 8-bit ANSI forms. +func ansiSampleMixed() string { + var b strings.Builder + a7 := ansiSample() + a8 := ansiSample8Bit() + b.WriteString(a7) + b.WriteString(a8) + return b.String() +} + // BenchmarkAnsiOption benchmarks the iterator on text that contains ANSI escapes, // and on plain text, with the AnsiEscapeSequences option on and off. func BenchmarkAnsiOption(b *testing.B) { - ansi := ansiSample() + ansi7 := ansiSample() + ansi8 := ansiSample8Bit() + ansiMixed := ansiSampleMixed() plain, err := testdata.Sample() if err != nil { b.Fatal(err) } plainStr := string(plain) - b.Run("AnsiText/OptionOn", func(b *testing.B) { - b.SetBytes(int64(len(ansi))) + b.Run("AnsiText7Bit/Option7BitOn", func(b *testing.B) { + b.SetBytes(int64(len(ansi7))) for i := 0; i < b.N; i++ { - iter := graphemes.FromString(ansi) + iter := graphemes.FromString(ansi7) iter.AnsiEscapeSequences = true c := 0 for iter.Next() { @@ -380,10 +585,80 @@ func BenchmarkAnsiOption(b *testing.B) { } }) - b.Run("AnsiText/OptionOff", func(b *testing.B) { - b.SetBytes(int64(len(ansi))) + b.Run("AnsiText7Bit/OptionOff", func(b *testing.B) { + b.SetBytes(int64(len(ansi7))) for i := 0; i < b.N; i++ { - iter := graphemes.FromString(ansi) + iter := graphemes.FromString(ansi7) + c := 0 + for iter.Next() { + _ = iter.Value() + c++ + } + b.ReportMetric(float64(c), "tokens") + } + }) + + b.Run("AnsiText8Bit/Option8BitOn", func(b *testing.B) { + b.SetBytes(int64(len(ansi8))) + for i := 0; i < b.N; i++ { + iter := graphemes.FromString(ansi8) + iter.AnsiEscapeSequences8Bit = true + c := 0 + for iter.Next() { + _ = iter.Value() + c++ + } + b.ReportMetric(float64(c), "tokens") + } + }) + + b.Run("AnsiText8Bit/OptionOff", func(b *testing.B) { + b.SetBytes(int64(len(ansi8))) + for i := 0; i < b.N; i++ { + iter := graphemes.FromString(ansi8) + c := 0 + for iter.Next() { + _ = iter.Value() + c++ + } + b.ReportMetric(float64(c), "tokens") + } + }) + + b.Run("AnsiTextMixed/BothOptionsOn", func(b *testing.B) { + b.SetBytes(int64(len(ansiMixed))) + for i := 0; i < b.N; i++ { + iter := graphemes.FromString(ansiMixed) + iter.AnsiEscapeSequences = true + iter.AnsiEscapeSequences8Bit = true + c := 0 + for iter.Next() { + _ = iter.Value() + c++ + } + b.ReportMetric(float64(c), "tokens") + } + }) + + b.Run("PlainText/Option7BitOn", func(b *testing.B) { + b.SetBytes(int64(len(plainStr))) + for i := 0; i < b.N; i++ { + iter := graphemes.FromString(plainStr) + iter.AnsiEscapeSequences = true + c := 0 + for iter.Next() { + _ = iter.Value() + c++ + } + b.ReportMetric(float64(c), "tokens") + } + }) + + b.Run("PlainText/Option8BitOn", func(b *testing.B) { + b.SetBytes(int64(len(plainStr))) + for i := 0; i < b.N; i++ { + iter := graphemes.FromString(plainStr) + iter.AnsiEscapeSequences8Bit = true c := 0 for iter.Next() { _ = iter.Value() @@ -393,11 +668,12 @@ func BenchmarkAnsiOption(b *testing.B) { } }) - b.Run("PlainText/OptionOn", func(b *testing.B) { + b.Run("PlainText/BothOptionsOn", func(b *testing.B) { b.SetBytes(int64(len(plainStr))) for i := 0; i < b.N; i++ { iter := graphemes.FromString(plainStr) iter.AnsiEscapeSequences = true + iter.AnsiEscapeSequences8Bit = true c := 0 for iter.Next() { _ = iter.Value() diff --git a/graphemes/comparative/comparative_test.go b/graphemes/comparative/comparative_test.go index fa5ac61..2e3b8ef 100644 --- a/graphemes/comparative/comparative_test.go +++ b/graphemes/comparative/comparative_test.go @@ -1,9 +1,11 @@ package comparative import ( + "reflect" "strings" "testing" + "github.com/charmbracelet/x/ansi" "github.com/clipperhouse/uax29/v2/graphemes" "github.com/clipperhouse/uax29/v2/testdata" "github.com/rivo/uniseg" @@ -75,3 +77,325 @@ func BenchmarkGraphemesASCII(b *testing.B) { } }) } + +// TestAnsiBoundaryAgreement verifies that our ANSI sequence parsing produces +// the same token boundaries as charmbracelet/x/ansi's DecodeSequence. +// Inputs use ASCII text between sequences so grapheme clustering differences +// don't obscure ANSI boundary comparison. +func TestAnsiBoundaryAgreement(t *testing.T) { + tests := []struct { + name string + input string + }{ + // 7-bit CSI + {"SGR reset", "\x1b[0m"}, + {"SGR color then text then reset", "\x1b[31mhello\x1b[0m"}, + {"CSI bold+color", "\x1b[1;32m"}, + {"CSI cursor position", "\x1b[10;20H"}, + + // 7-bit OSC + {"OSC title with BEL", "\x1b]0;My Title\x07"}, + {"OSC title with ST", "\x1b]0;Title\x1b\\"}, + + // 7-bit DCS/SOS/PM/APC + {"DCS with ST", "\x1bPq#0;2;0;0;0\x1b\\"}, + {"SOS with ST", "\x1bXhello\x1b\\"}, + {"PM with ST", "\x1b^msg\x1b\\"}, + {"APC with ST", "\x1b_data\x1b\\"}, + + // Two-byte Fe/Fs/Fp + {"Fe IND", "\x1bD"}, + {"Fs RIS", "\x1bc"}, + {"Fp DECSC", "\x1b7"}, + + // C1 8-bit + {"C1 CSI then text", "\x9B31mhello"}, + {"C1 OSC with C1 ST", "\x9D0;Title\x9C"}, + {"C1 DCS with C1 ST", "\x90qpayload\x9C"}, + {"C1 SOS with C1 ST", "\x98hello\x9C"}, + {"C1 APC with C1 ST", "\x9Fdata\x9C"}, + + // CSI variants (from charmbracelet test suite) + {"CSI private mode", "\x1b[?1049h"}, + {"CSI subparams (colons)", "\x1b[38:2:255:0:255;1m"}, + {"CSI with intermediate", "\x1b[0 q"}, + {"CSI no params", "\x1b[m"}, + {"CSI mouse click", "\x1b[<0;1;1M"}, + {"CSI mouse wheel", "\x1b[<64;2;11m"}, + {"CSI bracketed paste on", "\x1b[?2004h"}, + {"CSI bracketed paste content", "\x1b[200~pasted text\x1b[201~"}, + + // SS3 / SS2 (Single Shift) + {"SS3 7-bit", "\x1bOA"}, + {"SS3 8-bit", "\x8fA"}, + {"SS2 7-bit", "\x1bNA"}, + {"SS2 8-bit", "\x8eA"}, + + // nF sequences + {"nF charset G0", "\x1b(A"}, + {"nF charset G0 then text", "\x1b(Btext"}, + + // DCS with params + {"C1 DCS with params and C1 ST", "\x90?123;456+q\x9c"}, + + // APC payload (Kitty graphics protocol) + {"APC kitty graphics", "\x1b_Gf=24,s=10,v=20,o=z;aGVsbG8gd29ybGQ=\x1b\\"}, + + // C1 CSI with multiple params + {"C1 CSI multiple params", "\x9B1;2;3m"}, + + // Mixed 7-bit and C1 + {"mixed 7-bit and C1", "\x1b[1m\x9B31mhello\x1b[0m"}, + + // Concatenated sequences + {"concatenated CSI+OSC", "\x1b[1;2;3m\x1b]2;Terminal\x07"}, + {"OSC then CSI", "\x1b]0;Title\x07\x1b[31mred\x1b[0m"}, + + // Text around sequences + {"text around SGR", "hello, \x1b[1;2;3mworld\x1b[0m!"}, + + // Realistic colored output + {"colored ls", "\x1b[1;34mDocuments\x1b[0m \x1b[0;32mbuild.sh\x1b[0m"}, + + // Plain text (no ANSI) + {"plain ASCII", "hello world"}, + + // DecodeSequence parser parity edge cases + {"single ESC byte", "\x1b"}, + {"single NUL byte", "\x00"}, + {"ASCII DEL byte", "\x7f"}, + {"DEL between ASCII runes", "a\x7fb"}, + {"double ESC", "\x1b\x1b"}, + {"double ST 7-bit", "\x1b\\\x1b\\"}, + {"double ST 8-bit", "\x9c\x9c"}, + {"single-param OSC", "\x1b]112\x07"}, + {"ESC with intermediate", "\x1b Q"}, + {"DCS containing DEL payload", "\x1bP1;2+xa\x7fb\x1b\\"}, + {"OSC with C1 bytes in payload", "\x1b]11;\x90?\x1b\\"}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ours := uax29Tokens(tt.input) + theirs := charmTokens(tt.input) + if !reflect.DeepEqual(ours, theirs) { + t.Errorf("boundary mismatch\nours: %q\ntheirs: %q", ours, theirs) + } + }) + } +} + +// TestAnsiBoundaryKnownDivergences documents cases where our grapheme-oriented +// tokenizer intentionally differs from charmbracelet/x/ansi DecodeSequence. +func TestAnsiBoundaryKnownDivergences(t *testing.T) { + tests := []struct { + name string + input string + reason string + }{ + { + name: "unterminated CSI", + input: "\x1b[1;2;3", + reason: "DecodeSequence returns one unterminated CSI token; we split when no final byte is present", + }, + { + name: "unterminated OSC", + input: "\x1b]11;ff/00/ff", + reason: "DecodeSequence returns one unterminated OSC token; we split when OSC has no BEL/ST/CAN/SUB terminator", + }, + { + name: "unterminated OSC followed by CSI", + input: "\x1b]11;ff/00/ff\x1b[1;2;3m", + reason: "DecodeSequence ends OSC at ESC and parses following CSI; we require explicit OSC terminator", + }, + { + name: "unterminated OSC followed by bare ESC", + input: "\x1b]11;ff/00/ff\x1b", + reason: "DecodeSequence emits unterminated OSC then ESC; we split because OSC is invalid without terminator", + }, + { + name: "unterminated DCS", + input: "\x1bP1;2+xa", + reason: "DecodeSequence returns one unterminated DCS token; we split when DCS has no ST/CAN/SUB terminator", + }, + { + name: "invalid DCS immediately terminated", + input: "\x1bP\x1b\\ab", + reason: "DecodeSequence emits ESC P token before ST; we do not treat invalid DCS start as a sequence", + }, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + ours := uax29Tokens(tt.input) + theirs := charmTokens(tt.input) + if reflect.DeepEqual(ours, theirs) { + t.Fatalf("expected divergence, but boundaries matched\nreason: %s\ntokens: %q", tt.reason, ours) + } + t.Logf("reason: %s", tt.reason) + t.Logf("ours: %q", ours) + t.Logf("theirs: %q", theirs) + }) + } +} + +// ansiSample builds a realistic ANSI-heavy string simulating colored terminal output. +func ansiSample() string { + var b strings.Builder + colors := []string{ + "\x1b[1;34m", // bold blue + "\x1b[0;32m", // green + "\x1b[0;36m", // cyan + "\x1b[1;31m", // bold red + "\x1b[33m", // yellow + } + reset := "\x1b[0m" + lines := []string{ + "drwxr-xr-x 5 user staff 160 Jan 1 12:00 Documents", + "drwxr-xr-x 3 user staff 96 Feb 2 09:30 Downloads", + "-rwxr-xr-x 1 user staff 8432 Mar 15 14:22 build.sh", + "lrwxr-xr-x 1 user staff 11 Apr 20 08:00 config", + "-rw-r--r-- 1 user staff 1024 May 5 16:45 README.md", + } + for round := 0; round < 40; round++ { + for i, line := range lines { + color := colors[i%len(colors)] + if i%5 == 0 { + b.WriteString("\x1b]0;terminal - round ") + b.WriteString(string(rune('0' + round%10))) + b.WriteString("\x07") + } + b.WriteString(color) + b.WriteString(line) + b.WriteString(reset) + b.WriteString("\n") + } + } + return b.String() +} + +func ansiSample8Bit() string { + var b strings.Builder + lines := []string{ + "drwxr-xr-x 5 user staff 160 Jan 1 12:00 Documents", + "drwxr-xr-x 3 user staff 96 Feb 2 09:30 Downloads", + "-rwxr-xr-x 1 user staff 8432 Mar 15 14:22 build.sh", + "lrwxr-xr-x 1 user staff 11 Apr 20 08:00 config", + "-rw-r--r-- 1 user staff 1024 May 5 16:45 README.md", + } + for round := 0; round < 40; round++ { + for i, line := range lines { + if i%5 == 0 { + b.WriteByte(0x9D) + b.WriteString("0;terminal - round ") + b.WriteString(string(rune('0' + round%10))) + b.WriteByte(0x07) + } + b.WriteByte(0x9B) + b.WriteString("1;3") + b.WriteString(string(rune('0' + (i % 8)))) + b.WriteByte('m') + b.WriteString(line) + b.WriteByte(0x9B) + b.WriteString("0m") + b.WriteString("\n") + } + } + return b.String() +} + +func ansiSampleMixed() string { + return ansiSample() + ansiSample8Bit() +} + +func BenchmarkAnsiIteration(b *testing.B) { + input7 := ansiSample() + input8 := ansiSample8Bit() + inputMixed := ansiSampleMixed() + + b.Run("clipperhouse/uax29/7bit", func(b *testing.B) { + b.SetBytes(int64(len(input7))) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + count := 0 + g := graphemes.FromString(input7) + g.AnsiEscapeSequences = true + for g.Next() { + count++ + } + } + }) + + b.Run("clipperhouse/uax29/8bit", func(b *testing.B) { + b.SetBytes(int64(len(input8))) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + count := 0 + g := graphemes.FromString(input8) + g.AnsiEscapeSequences8Bit = true + for g.Next() { + count++ + } + } + }) + + b.Run("clipperhouse/uax29/both", func(b *testing.B) { + b.SetBytes(int64(len(inputMixed))) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + count := 0 + g := graphemes.FromString(inputMixed) + g.AnsiEscapeSequences = true + g.AnsiEscapeSequences8Bit = true + for g.Next() { + count++ + } + } + }) + + b.Run("charmbracelet/x/ansi/mixed", func(b *testing.B) { + b.SetBytes(int64(len(inputMixed))) + b.ReportAllocs() + b.ResetTimer() + for i := 0; i < b.N; i++ { + count := 0 + var state byte + remaining := inputMixed + for len(remaining) > 0 { + _, _, advance, newState := ansi.DecodeSequence(remaining, state, nil) + state = newState + remaining = remaining[advance:] + count++ + } + } + }) +} + +// uax29Tokens segments the input using our graphemes iterator with ANSI support. +func uax29Tokens(input string) []string { + iter := graphemes.FromString(input) + iter.AnsiEscapeSequences = true + iter.AnsiEscapeSequences8Bit = true + var tokens []string + for iter.Next() { + tokens = append(tokens, iter.Value()) + } + return tokens +} + +// charmTokens segments the input using charmbracelet/x/ansi's DecodeSequence. +func charmTokens(input string) []string { + var state byte + remaining := input + var tokens []string + for len(remaining) > 0 { + seq, _, n, newState := ansi.DecodeSequence(remaining, state, nil) + tokens = append(tokens, seq) + state = newState + remaining = remaining[n:] + } + return tokens +} diff --git a/graphemes/comparative/go.mod b/graphemes/comparative/go.mod index 570fcd1..33c156e 100644 --- a/graphemes/comparative/go.mod +++ b/graphemes/comparative/go.mod @@ -1,10 +1,18 @@ module github.com/clipperhouse/uax29/graphemes/comparative -go 1.18 +go 1.24.2 require ( + github.com/charmbracelet/x/ansi v0.11.6 github.com/clipperhouse/uax29/v2 v2.6.0 github.com/rivo/uniseg v0.4.7 ) +require ( + github.com/clipperhouse/displaywidth v0.9.0 // indirect + github.com/clipperhouse/stringish v0.1.1 // indirect + github.com/lucasb-eyer/go-colorful v1.3.0 // indirect + github.com/mattn/go-runewidth v0.0.19 // indirect +) + replace github.com/clipperhouse/uax29/v2 => ../../ diff --git a/graphemes/comparative/go.sum b/graphemes/comparative/go.sum index 9008848..df31c87 100644 --- a/graphemes/comparative/go.sum +++ b/graphemes/comparative/go.sum @@ -1,2 +1,12 @@ +github.com/charmbracelet/x/ansi v0.11.6 h1:GhV21SiDz/45W9AnV2R61xZMRri5NlLnl6CVF7ihZW8= +github.com/charmbracelet/x/ansi v0.11.6/go.mod h1:2JNYLgQUsyqaiLovhU2Rv/pb8r6ydXKS3NIttu3VGZQ= +github.com/clipperhouse/displaywidth v0.9.0 h1:Qb4KOhYwRiN3viMv1v/3cTBlz3AcAZX3+y9OLhMtAtA= +github.com/clipperhouse/displaywidth v0.9.0/go.mod h1:aCAAqTlh4GIVkhQnJpbL0T/WfcrJXHcj8C0yjYcjOZA= +github.com/clipperhouse/stringish v0.1.1 h1:+NSqMOr3GR6k1FdRhhnXrLfztGzuG+VuFDfatpWHKCs= +github.com/clipperhouse/stringish v0.1.1/go.mod h1:v/WhFtE1q0ovMta2+m+UbpZ+2/HEXNWYXQgCt4hdOzA= +github.com/lucasb-eyer/go-colorful v1.3.0 h1:2/yBRLdWBZKrf7gB40FoiKfAWYQ0lqNcbuQwVHXptag= +github.com/lucasb-eyer/go-colorful v1.3.0/go.mod h1:R4dSotOR9KMtayYi1e77YzuveK+i7ruzyGqttikkLy0= +github.com/mattn/go-runewidth v0.0.19 h1:v++JhqYnZuu5jSKrk9RbgF5v4CGUjqRfBm05byFGLdw= +github.com/mattn/go-runewidth v0.0.19/go.mod h1:XBkDxAl56ILZc9knddidhrOlY5R/pDhgLpndooCuJAs= github.com/rivo/uniseg v0.4.7 h1:WUdvkW8uEhrYfLC4ZzdpI2ztxP1I582+49Oc5Mq64VQ= github.com/rivo/uniseg v0.4.7/go.mod h1:FN3SvrM+Zdj16jyLfmOkMNblXMcoc8DfTHruCPUcx88= diff --git a/graphemes/fuzz_test.go b/graphemes/fuzz_test.go index d6b840a..9d39677 100644 --- a/graphemes/fuzz_test.go +++ b/graphemes/fuzz_test.go @@ -153,3 +153,73 @@ func FuzzInvalid(f *testing.F) { } }) } + +// FuzzANSIOptions fuzzes iterator roundtripping with ANSI options enabled. +// This specifically exercises 7-bit only, 8-bit only, and combined modes. +func FuzzANSIOptions(f *testing.F) { + if testing.Short() { + f.Skip("skipping fuzz test in short mode") + } + + seeds := [][]byte{ + []byte("\x1b[31mhello\x1b[0m"), // 7-bit CSI + []byte("\x1b]0;Title\x07"), // 7-bit OSC + BEL + []byte("\x1bPqpayload\x1b\\"), // 7-bit DCS + 7-bit ST + []byte("\x9B31mhello"), // C1 CSI + []byte("\x9D0;Title\x9C"), // C1 OSC + C1 ST + []byte("\x90qpayload\x9C"), // C1 DCS + C1 ST + []byte("\x98hello\x9C"), // C1 SOS + C1 ST + []byte("\x9Emsg\x9C"), // C1 PM + C1 ST + []byte("\x9Fdata\x9C"), // C1 APC + C1 ST + []byte("\x1b]0;Title\x9C"), // 7-bit initiator + C1 ST (strict negative) + []byte("\x9D0;Title\x1b\\"), // C1 initiator + 7-bit ST (strict negative) + []byte("\x1b]0;本\x07"), // UTF-8 in OSC payload + []byte("\x90q本\x9C"), // UTF-8 in C1 DCS payload + []byte("\x1b[31m\x9B1;32mtext\x1b[0m"), // mixed 7-bit + 8-bit CSI + []byte("\x1b"), // truncated ESC + []byte("\x9D0;unterminated"), // unterminated C1 OSC + []byte("plain UTF-8: café 日本語 👩🏽‍💻"), // non-ANSI UTF-8 + } + for _, s := range seeds { + f.Add(s) + } + + f.Fuzz(func(t *testing.T, original []byte) { + validOriginal := utf8.Valid(original) + + modes := []struct { + name string + ansi7Bit bool + ansi8Bit bool + }{ + {name: "off", ansi7Bit: false, ansi8Bit: false}, + {name: "7bit", ansi7Bit: true, ansi8Bit: false}, + {name: "8bit", ansi7Bit: false, ansi8Bit: true}, + {name: "both", ansi7Bit: true, ansi8Bit: true}, + } + + for _, mode := range modes { + tokens := graphemes.FromBytes(original) + tokens.AnsiEscapeSequences = mode.ansi7Bit + tokens.AnsiEscapeSequences8Bit = mode.ansi8Bit + + var all [][]byte + for tokens.Next() { + all = append(all, tokens.Value()) + } + + roundtrip := make([]byte, 0, len(original)) + for _, s := range all { + roundtrip = append(roundtrip, s...) + } + + if !bytes.Equal(roundtrip, original) { + t.Fatalf("%s mode: bytes did not roundtrip", mode.name) + } + + if validOriginal != utf8.Valid(roundtrip) { + t.Fatalf("%s mode: utf8 validity of original did not match roundtrip", mode.name) + } + } + }) +} diff --git a/graphemes/iterator.go b/graphemes/iterator.go index a734657..90d669a 100644 --- a/graphemes/iterator.go +++ b/graphemes/iterator.go @@ -27,9 +27,18 @@ type Iterator[T ~string | ~[]byte] struct { data T pos int start int - // AnsiEscapeSequences treats ANSI escape sequences (ECMA-48) as single grapheme - // clusters when true. Default is false. + // AnsiEscapeSequences treats 7-bit C0 ANSI escape sequences (ECMA-48) as + // single grapheme clusters when true. The default is false. + // + // 8-bit controls are not enabled by this option. See [AnsiEscapeSequences8Bit]. AnsiEscapeSequences bool + // AnsiEscapeSequences8Bit treats 8-bit C1 ANSI escape sequences (ECMA-48) as single + // grapheme clusters when true. The default is false. + // + // 8-bit control bytes are not UTF-8 encoded, i.e. not valid UTF-8. If you + // choose this option, you are choosing to interpret non-UTF-8 data, caveat + // emptor. + AnsiEscapeSequences8Bit bool } var ( @@ -38,12 +47,12 @@ var ( ) const ( - esc = 0x1B - cr = 0x0D - bel = 0x07 - can = 0x18 - sub = 0x1A - c1UTF8Lead = 0xC2 + esc = 0x1B + cr = 0x0D + bel = 0x07 + can = 0x18 + sub = 0x1A + st = 0x9C ) // Next advances the iterator to the next grapheme cluster. @@ -54,16 +63,21 @@ func (iter *Iterator[T]) Next() bool { } iter.start = iter.pos - if iter.AnsiEscapeSequences && (iter.data[iter.pos] == esc || iter.data[iter.pos] == c1UTF8Lead) { + b := iter.data[iter.pos] + if iter.AnsiEscapeSequences && b == esc { if a := ansiEscapeLength(iter.data[iter.pos:]); a > 0 { iter.pos += a return true } } + if iter.AnsiEscapeSequences8Bit && b >= 0x80 && b <= 0x9F { + if a := ansiEscapeLength8Bit(iter.data[iter.pos:]); a > 0 { + iter.pos += a + return true + } + } // ASCII hot path: any ASCII is one grapheme when next byte is ASCII or end. - // Fall through on CR so splitfunc can handle CR+LF as a single cluster. - b := iter.data[iter.pos] if b < utf8.RuneSelf && b != cr { if iter.pos+1 >= len(iter.data) || iter.data[iter.pos+1] < utf8.RuneSelf { iter.pos++