Skip to content
Merged
3 changes: 3 additions & 0 deletions .github/workflows/gofuzz.yml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@ jobs:
matrix:
package: [words, sentences, graphemes, phrases]
fuzzer: [FuzzValidShort, FuzzValidLong, FuzzInvalid]
include:
- package: graphemes
fuzzer: FuzzANSIOptions
runs-on: ubuntu-latest
steps:
- name: Set up Go
Expand Down
14 changes: 12 additions & 2 deletions graphemes/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ for g.Next() { // Next() returns true until end of data

### ANSI escape sequences

By the UAX 29 specification, ANSI escape sequences are not grapheme clusters. To treat these sequences as a single cluster, set the `AnsiEscapeSequences` option to true.
By the UAX 29 specification, ANSI escape sequences are not grapheme clusters. To treat 7-bit ANSI escape sequences as a single cluster, set `AnsiEscapeSequences` to true.

```go
text := "Hello, \x1b[31mworld\x1b[0m!"
Expand All @@ -86,7 +86,17 @@ for g.Next() {
}
```

We implement [ECMA-48](https://ecma-international.org/publications-and-standards/standards/ecma-48/) C0 and C1 control codes, 7-bit and 8-bit, in UTF-8 encoding.
To also parse 8-bit C1 controls (non-UTF-8 bytes), set `AnsiEscapeSequences8Bit` to true.

```go
g.AnsiEscapeSequences = true // 7-bit forms (ESC ...)
g.AnsiEscapeSequences8Bit = true // 8-bit C1 forms (0x80-0x9F), not valid UTF-8
```

For ESC-initiated (7-bit) control strings, only 7-bit terminators are recognized.
For C1-initiated (8-bit) control strings, only C1 ST (`0x9C`) is recognized as ST.

We implement [ECMA-48](https://ecma-international.org/publications-and-standards/standards/ecma-48/) control codes in both 7-bit and 8-bit representations. Note that 8-bit C1 control codes are raw bytes (`0x80`–`0x9F`), not UTF-8 encoded, so input containing them is not valid UTF-8 — caveat emptor.

### Benchmarks

Expand Down
131 changes: 45 additions & 86 deletions graphemes/ansi.go
Original file line number Diff line number Diff line change
@@ -1,108 +1,72 @@
package graphemes

// ansiEscapeLength returns the byte length of a valid ANSI escape/control
// ansiEscapeLength returns the byte length of a valid 7-bit ANSI escape
// sequence at the start of data, or 0 if none.
//
// Input is UTF-8. This recognizes both:
// - 7-bit representations (ESC + final/intermediate bytes), and
// - UTF-8 encodings of 8-bit C1 controls (U+0080..U+009F => 0xC2 0x80..0x9F).
//
// Recognized forms (ECMA-48 / ISO 6429):
// - CSI: ESC [ then parameter bytes (0x30–0x3F), intermediate (0x20–0x2F), final (0x40–0x7E)
// - OSC: ESC ] then payload until ST (ESC \) or BEL (0x07)
// - DCS, SOS, PM, APC: ESC P / X / ^ / _ then payload until ST (ESC \)
// - Two-byte: ESC + Fe/Fs (0x40–0x7E excluding above), or Fp (0x30–0x3F), or nF (0x20–0x2F then final)
// - CSI: ESC [ then parameter bytes (0x30-0x3F), intermediate (0x20-0x2F), final (0x40-0x7E)
// - OSC: ESC ] then payload until BEL (0x07), 7-bit ST (ESC \), CAN (0x18), or SUB (0x1A)
// - DCS, SOS, PM, APC: ESC P/X/^/_ then payload until 7-bit ST (ESC \), CAN, or SUB
// - Two-byte: ESC + Fe/Fs (0x40-0x7E excluding above), or Fp (0x30-0x3F), or nF (0x20-0x2F then final)
func ansiEscapeLength[T ~string | ~[]byte](data T) int {
n := len(data)
if n < 2 {
if n < 2 || data[0] != esc {
return 0
}

switch data[0] {
case esc:
b1 := data[1]
switch b1 {
case '[': // CSI
body := csiLength(data[2:])
if body == 0 {
return 0
}
return 2 + body
case ']': // OSC – allows BEL or ST as terminator
body := oscLength(data[2:])
if body < 0 {
return 0
}
return 2 + body
case 'P', 'X', '^', '_': // DCS, SOS, PM, APC – require ST only
body := stSequenceLength(data[2:])
if body < 0 {
return 0
}
return 2 + body
}
if b1 >= 0x40 && b1 <= 0x7E {
// Fe/Fs two-byte; [ ] P X ^ _ handled above
return 2
}
if b1 >= 0x30 && b1 <= 0x3F {
// Fp (private) two-byte
return 2
b1 := data[1]
switch b1 {
case '[': // CSI
body := csiBodyLength(data[2:])
if body == 0 {
return 0
}
if b1 >= 0x20 && b1 <= 0x2F {
// nF: intermediates then one final (0x30–0x7E)
i := 2
for i < n && data[i] >= 0x20 && data[i] <= 0x2F {
i++
}
if i < n && data[i] >= 0x30 && data[i] <= 0x7E {
return i + 1
}
return 2 + body
case ']': // OSC - allows BEL or 7-bit ST terminator
body := oscLength(data[2:])
if body < 0 {
return 0
}

case c1UTF8Lead:
b1 := data[1]
if b1 < 0x80 || b1 > 0x9F {
return 2 + body
case 'P', 'X', '^', '_': // DCS, SOS, PM, APC
body := stSequenceLength(data[2:])
if body < 0 {
return 0
}
return 2 + body
}

switch b1 {
case 0x9B: // CSI
body := csiLength(data[2:])
if body == 0 {
return 0
}
return 2 + body
case 0x9D: // OSC – allows BEL or ST as terminator
body := oscLength(data[2:])
if body < 0 {
return 0
}
return 2 + body
case 0x90, 0x98, 0x9E, 0x9F: // DCS, SOS, PM, APC – require ST only
body := stSequenceLength(data[2:])
if body < 0 {
return 0
}
return 2 + body
default:
// Any other C1 control (UTF-8 encoded) is one control sequence token.
return 2
if b1 >= 0x40 && b1 <= 0x7E {
// Fe/Fs two-byte; [ ] P X ^ _ handled above
return 2
}
if b1 >= 0x30 && b1 <= 0x3F {
// Fp (private) two-byte
return 2
}
if b1 >= 0x20 && b1 <= 0x2F {
// nF: intermediates then one final (0x30-0x7E)
i := 2
for i < n && data[i] >= 0x20 && data[i] <= 0x2F {
i++
}
if i < n && data[i] >= 0x30 && data[i] <= 0x7E {
return i + 1
}
return 0
}

return 0
}

// csiLength returns the length of the CSI body (param/intermediate/final bytes).
// csiBodyLength returns the length of the CSI body (param/intermediate/final bytes).
// data is the slice after "ESC [".
// Per ECMA-48, the CSI body has the form:
//
// parameters (0x30–0x3F)*, intermediates (0x20–0x2F)*, final (0x40–0x7E)
//
// Once an intermediate byte is seen, subsequent parameter bytes are invalid.
func csiLength[T ~string | ~[]byte](data T) int {
func csiBodyLength[T ~string | ~[]byte](data T) int {
seenIntermediate := false
for i := 0; i < len(data); i++ {
b := data[i]
Expand All @@ -125,13 +89,13 @@ func csiLength[T ~string | ~[]byte](data T) int {
}

// oscLength returns the length of the OSC body.
// data is the slice after "ESC ]" (or C1 OSC).
// data is the slice after "ESC ]".
//
// Returns:
// - n >= 0: consumed body length (includes BEL/ST terminator when present)
// - -1: not terminated in the provided data
//
// OSC accepts BEL (0x07) or ST as terminator by widespread convention.
// OSC accepts BEL (0x07) or 7-bit ST (ESC \) as terminators by widespread convention.
// Per ECMA-48, CAN (0x18) and SUB (0x1A) cancel the control string; in that
// case they are not part of the OSC sequence length.
func oscLength[T ~string | ~[]byte](data T) int {
Expand All @@ -146,21 +110,19 @@ func oscLength[T ~string | ~[]byte](data T) int {
if b == esc && i+1 < len(data) && data[i+1] == '\\' {
return i + 2
}
if b == c1UTF8Lead && i+1 < len(data) && data[i+1] == 0x9C {
return i + 2
}
}
return -1
}

// stSequenceLength returns the length of a control-string body.
// data is the slice after "ESC x" (or C1 DCS/SOS/PM/APC).
// data is the slice after "ESC x".
//
// Returns:
// - n >= 0: consumed body length (includes ST terminator when present)
// - -1: not terminated in the provided data
//
// Used for DCS, SOS, PM, and APC, which per ECMA-48 terminate with ST.
// ST here is the 7-bit form (ESC \).
// CAN (0x18) and SUB (0x1A) cancel the control string; in that case they are
// not part of the sequence length.
func stSequenceLength[T ~string | ~[]byte](data T) int {
Expand All @@ -171,9 +133,6 @@ func stSequenceLength[T ~string | ~[]byte](data T) int {
if data[i] == esc && i+1 < len(data) && data[i+1] == '\\' {
return i + 2
}
if data[i] == c1UTF8Lead && i+1 < len(data) && data[i+1] == 0x9C {
return i + 2
}
}
return -1
}
79 changes: 79 additions & 0 deletions graphemes/ansi8.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
package graphemes

// ansiEscapeLength8Bit reports how many bytes at the start of data form a
// valid 8-bit C1 ANSI sequence. It returns 0 when data is empty, when the
// first byte is not in the C1 range 0x80..0x9F, or when a sequence that needs
// a body is malformed or unterminated.
//
// Dispatch on the first byte (ECMA-48 / ISO 6429):
//   - 0x9B (C1 CSI): one byte plus the CSI body measured by csiBodyLength;
//     a zero-length body means no match.
//   - 0x9D (C1 OSC): one byte plus the body measured by oscLengthC1
//     (BEL or C1 ST terminated, or canceled by CAN/SUB).
//   - 0x90, 0x98, 0x9E, 0x9F (C1 DCS, SOS, PM, APC): one byte plus the body
//     measured by stSequenceLengthC1 (C1 ST terminated, or canceled by CAN/SUB).
//   - Any other byte in 0x80..0x9F: a standalone control, length 1.
func ansiEscapeLength8Bit[T ~string | ~[]byte](data T) int {
	if len(data) == 0 {
		return 0
	}

	lead := data[0]
	if lead < 0x80 || lead > 0x9F {
		// Not a C1 control byte at all.
		return 0
	}

	rest := data[1:]
	var body int

	switch lead {
	case 0x9B: // C1 CSI
		body = csiBodyLength(rest)
		if body == 0 {
			return 0
		}
	case 0x9D: // C1 OSC
		body = oscLengthC1(rest)
		if body < 0 {
			return 0
		}
	case 0x90, 0x98, 0x9E, 0x9F: // C1 DCS, SOS, PM, APC
		body = stSequenceLengthC1(rest)
		if body < 0 {
			return 0
		}
	default:
		// Every remaining C1 control stands alone as a single byte.
		return 1
	}

	// One initiator byte plus whatever the body parser consumed.
	return 1 + body
}

// oscLengthC1 measures the body of a C1-initiated OSC control string.
// data is everything that follows the C1 OSC initiator (0x9D).
//
// The body is scanned byte by byte and the scan stops at the first of:
//   - BEL (0x07) or C1 ST (0x9C): the terminator is counted, so the result
//     is the terminator's index plus one.
//   - CAN (0x18) or SUB (0x1A): the control string is canceled and the
//     canceling byte is not counted, so the result is its index.
//
// If none of those bytes occurs in data, the string is unterminated and the
// result is -1.
func oscLengthC1[T ~string | ~[]byte](data T) int {
	for pos := 0; pos < len(data); pos++ {
		switch data[pos] {
		case bel, st:
			// Terminator belongs to the sequence.
			return pos + 1
		case can, sub:
			// Cancel byte ends the sequence but is not part of it.
			return pos
		}
	}
	return -1
}

// stSequenceLengthC1 parses DCS/SOS/PM/APC bodies that terminate with C1 ST
// (0x9C), or are canceled by CAN/SUB.
func stSequenceLengthC1[T ~string | ~[]byte](data T) int {
for i := 0; i < len(data); i++ {
b := data[i]
if b == can || b == sub {
return i
}
if b == st {
return i + 1
}
}
return -1
}
Loading
Loading