clipperhouse · clipperhouse · Feb 16, 2026 · Feb 15, 2026 · Feb 15, 2026 · Feb 15, 2026
diff --git a/.github/workflows/gofuzz.yml b/.github/workflows/gofuzz.yml
@@ -12,6 +12,9 @@ jobs:
         matrix:
           package: [words, sentences, graphemes, phrases]
           fuzzer: [FuzzValidShort, FuzzValidLong, FuzzInvalid]
+          include:
+            - package: graphemes
+              fuzzer: FuzzANSIOptions
     runs-on: ubuntu-latest
     steps:
     - name: Set up Go

diff --git a/graphemes/README.md b/graphemes/README.md
@@ -74,7 +74,7 @@ for g.Next() {                     // Next() returns true until end of data
 
 ### ANSI escape sequences
 
-By the UAX 29 specification, ANSI escape sequences are not grapheme clusters. To treat these sequences as a single cluster, set the `AnsiEscapeSequences` option to true.
+By the UAX 29 specification, ANSI escape sequences are not grapheme clusters. To treat each 7-bit (ESC-initiated) ANSI escape sequence as a single cluster, set `AnsiEscapeSequences` to true.
 
 ```go
 text := "Hello, \x1b[31mworld\x1b[0m!"
@@ -86,7 +86,17 @@ for g.Next() {
 }
 ```
 
-We implement [ECMA-48](https://ecma-international.org/publications-and-standards/standards/ecma-48/) C0 and C1 control codes, 7-bit and 8-bit, in UTF-8 encoding.
+To also parse 8-bit C1 controls (non-UTF-8 bytes), set `AnsiEscapeSequences8Bit` to true.
+
+```go
+g.AnsiEscapeSequences = true     // 7-bit forms (ESC ...)
+g.AnsiEscapeSequences8Bit = true // 8-bit C1 forms (0x80-0x9F), not valid UTF-8
+```
+
+For ESC-initiated (7-bit) control strings, only 7-bit terminators are recognized: ST as `ESC \`, plus BEL (`0x07`) for OSC.
+For C1-initiated (8-bit) control strings, ST is recognized only as the C1 byte `0x9C`; BEL (`0x07`) also terminates OSC.
+
+We implement [ECMA-48](https://ecma-international.org/publications-and-standards/standards/ecma-48/) control codes in both 7-bit and 8-bit representations. Caveat emptor: 8-bit control codes are raw bytes in the range 0x80-0x9F, not UTF-8 encoded, so input containing them is not valid UTF-8.
 
 ### Benchmarks
 

diff --git a/graphemes/ansi.go b/graphemes/ansi.go
@@ -1,108 +1,72 @@
 package graphemes
 
-// ansiEscapeLength returns the byte length of a valid ANSI escape/control
+// ansiEscapeLength returns the byte length of a valid 7-bit ANSI escape
 // sequence at the start of data, or 0 if none.
 //
-// Input is UTF-8. This recognizes both:
-//   - 7-bit representations (ESC + final/intermediate bytes), and
-//   - UTF-8 encodings of 8-bit C1 controls (U+0080..U+009F => 0xC2 0x80..0x9F).
-//
 // Recognized forms (ECMA-48 / ISO 6429):
-//   - CSI: ESC [ then parameter bytes (0x30–0x3F), intermediate (0x20–0x2F), final (0x40–0x7E)
-//   - OSC: ESC ] then payload until ST (ESC \) or BEL (0x07)
-//   - DCS, SOS, PM, APC: ESC P / X / ^ / _ then payload until ST (ESC \)
-//   - Two-byte: ESC + Fe/Fs (0x40–0x7E excluding above), or Fp (0x30–0x3F), or nF (0x20–0x2F then final)
+//   - CSI: ESC [ then parameter bytes (0x30-0x3F), intermediate (0x20-0x2F), final (0x40-0x7E)
+//   - OSC: ESC ] then payload until BEL (0x07), 7-bit ST (ESC \), CAN (0x18), or SUB (0x1A)
+//   - DCS, SOS, PM, APC: ESC P/X/^/_ then payload until 7-bit ST (ESC \), CAN, or SUB
+//   - Two-byte: ESC + Fe/Fs (0x40-0x7E excluding above), or Fp (0x30-0x3F), or nF (0x20-0x2F then final)
 func ansiEscapeLength[T ~string | ~[]byte](data T) int {
 	n := len(data)
-	if n < 2 {
+	if n < 2 || data[0] != esc {
 		return 0
 	}
 
-	switch data[0] {
-	case esc:
-		b1 := data[1]
-		switch b1 {
-		case '[': // CSI
-			body := csiLength(data[2:])
-			if body == 0 {
-				return 0
-			}
-			return 2 + body
-		case ']': // OSC – allows BEL or ST as terminator
-			body := oscLength(data[2:])
-			if body < 0 {
-				return 0
-			}
-			return 2 + body
-		case 'P', 'X', '^', '_': // DCS, SOS, PM, APC – require ST only
-			body := stSequenceLength(data[2:])
-			if body < 0 {
-				return 0
-			}
-			return 2 + body
-		}
-		if b1 >= 0x40 && b1 <= 0x7E {
-			// Fe/Fs two-byte; [ ] P X ^ _ handled above
-			return 2
-		}
-		if b1 >= 0x30 && b1 <= 0x3F {
-			// Fp (private) two-byte
-			return 2
+	b1 := data[1]
+	switch b1 {
+	case '[': // CSI
+		body := csiBodyLength(data[2:])
+		if body == 0 {
+			return 0
 		}
-		if b1 >= 0x20 && b1 <= 0x2F {
-			// nF: intermediates then one final (0x30–0x7E)
-			i := 2
-			for i < n && data[i] >= 0x20 && data[i] <= 0x2F {
-				i++
-			}
-			if i < n && data[i] >= 0x30 && data[i] <= 0x7E {
-				return i + 1
-			}
+		return 2 + body
+	case ']': // OSC - allows BEL or 7-bit ST terminator
+		body := oscLength(data[2:])
+		if body < 0 {
 			return 0
 		}
-
-	case c1UTF8Lead:
-		b1 := data[1]
-		if b1 < 0x80 || b1 > 0x9F {
+		return 2 + body
+	case 'P', 'X', '^', '_': // DCS, SOS, PM, APC
+		body := stSequenceLength(data[2:])
+		if body < 0 {
 			return 0
 		}
+		return 2 + body
+	}
 
-		switch b1 {
-		case 0x9B: // CSI
-			body := csiLength(data[2:])
-			if body == 0 {
-				return 0
-			}
-			return 2 + body
-		case 0x9D: // OSC – allows BEL or ST as terminator
-			body := oscLength(data[2:])
-			if body < 0 {
-				return 0
-			}
-			return 2 + body
-		case 0x90, 0x98, 0x9E, 0x9F: // DCS, SOS, PM, APC – require ST only
-			body := stSequenceLength(data[2:])
-			if body < 0 {
-				return 0
-			}
-			return 2 + body
-		default:
-			// Any other C1 control (UTF-8 encoded) is one control sequence token.
-			return 2
+	if b1 >= 0x40 && b1 <= 0x7E {
+		// Fe/Fs two-byte; [ ] P X ^ _ handled above
+		return 2
+	}
+	if b1 >= 0x30 && b1 <= 0x3F {
+		// Fp (private) two-byte
+		return 2
+	}
+	if b1 >= 0x20 && b1 <= 0x2F {
+		// nF: intermediates then one final (0x30-0x7E)
+		i := 2
+		for i < n && data[i] >= 0x20 && data[i] <= 0x2F {
+			i++
 		}
+		if i < n && data[i] >= 0x30 && data[i] <= 0x7E {
+			return i + 1
+		}
+		return 0
 	}
 
 	return 0
 }
 
-// csiLength returns the length of the CSI body (param/intermediate/final bytes).
+// csiBodyLength returns the length of the CSI body (param/intermediate/final bytes).
-// data is the slice after "ESC [".
+// data is the slice after the CSI initiator: "ESC [" (7-bit) or 0x9B (8-bit C1).
 // Per ECMA-48, the CSI body has the form:
 //
 //	parameters (0x30–0x3F)*, intermediates (0x20–0x2F)*, final (0x40–0x7E)
 //
 // Once an intermediate byte is seen, subsequent parameter bytes are invalid.
-func csiLength[T ~string | ~[]byte](data T) int {
+func csiBodyLength[T ~string | ~[]byte](data T) int {
 	seenIntermediate := false
 	for i := 0; i < len(data); i++ {
 		b := data[i]
@@ -125,13 +89,13 @@ func csiLength[T ~string | ~[]byte](data T) int {
 }
 
 // oscLength returns the length of the OSC body.
-// data is the slice after "ESC ]" (or C1 OSC).
+// data is the slice after "ESC ]".
 //
 // Returns:
 //   - n >= 0: consumed body length (includes BEL/ST terminator when present)
 //   - -1: not terminated in the provided data
 //
-// OSC accepts BEL (0x07) or ST as terminator by widespread convention.
+// OSC accepts BEL (0x07) or 7-bit ST (ESC \) as terminators by widespread convention.
 // Per ECMA-48, CAN (0x18) and SUB (0x1A) cancel the control string; in that
 // case they are not part of the OSC sequence length.
 func oscLength[T ~string | ~[]byte](data T) int {
@@ -146,21 +110,19 @@ func oscLength[T ~string | ~[]byte](data T) int {
 		if b == esc && i+1 < len(data) && data[i+1] == '\\' {
 			return i + 2
 		}
-		if b == c1UTF8Lead && i+1 < len(data) && data[i+1] == 0x9C {
-			return i + 2
-		}
 	}
 	return -1
 }
 
 // stSequenceLength returns the length of a control-string body.
-// data is the slice after "ESC x" (or C1 DCS/SOS/PM/APC).
+// data is the slice after "ESC x".
 //
 // Returns:
 //   - n >= 0: consumed body length (includes ST terminator when present)
 //   - -1: not terminated in the provided data
 //
 // Used for DCS, SOS, PM, and APC, which per ECMA-48 terminate with ST.
+// ST here is the 7-bit form (ESC \).
 // CAN (0x18) and SUB (0x1A) cancel the control string; in that case they are
 // not part of the sequence length.
 func stSequenceLength[T ~string | ~[]byte](data T) int {
@@ -171,9 +133,6 @@ func stSequenceLength[T ~string | ~[]byte](data T) int {
 		if data[i] == esc && i+1 < len(data) && data[i+1] == '\\' {
 			return i + 2
 		}
-		if data[i] == c1UTF8Lead && i+1 < len(data) && data[i+1] == 0x9C {
-			return i + 2
-		}
 	}
 	return -1
 }
diff --git a/graphemes/ansi8.go b/graphemes/ansi8.go
@@ -0,0 +1,79 @@
+package graphemes
+
+// ansiEscapeLength8Bit returns the byte length of a valid 8-bit C1 ANSI
+// sequence at the start of data, or 0 if none. It also returns 0 when csiBodyLength reports 0 or a control-string body is unterminated.
+//
+// Recognized forms (ECMA-48 / ISO 6429):
+//   - C1 CSI (0x9B) body as parameter/intermediate/final bytes
+//   - C1 OSC (0x9D) body terminated by BEL, C1 ST, CAN, or SUB
+//   - C1 DCS/SOS/PM/APC (0x90/0x98/0x9E/0x9F) body terminated by C1 ST, CAN, or SUB
+//   - Standalone C1 controls (0x80..0x9F not listed above): single byte
+func ansiEscapeLength8Bit[T ~string | ~[]byte](data T) int {
+	if len(data) == 0 {
+		return 0
+	}
+
+	switch data[0] {
+	case 0x9B: // C1 CSI
+		body := csiBodyLength(data[1:])
+		if body == 0 { // a zero-length CSI body means no sequence here
+			return 0
+		}
+		return 1 + body // 1 initiator byte + body
+	case 0x9D: // C1 OSC
+		body := oscLengthC1(data[1:])
+		if body < 0 { // -1: no terminator or cancel byte found in data
+			return 0
+		}
+		return 1 + body // 1 initiator byte + body
+	case 0x90, 0x98, 0x9E, 0x9F: // C1 DCS, SOS, PM, APC
+		body := stSequenceLengthC1(data[1:])
+		if body < 0 { // -1: no terminator or cancel byte found in data
+			return 0
+		}
+		return 1 + body // 1 initiator byte + body
+	default:
+		if data[0] >= 0x80 && data[0] <= 0x9F { // any other C1 control stands alone
+			return 1
+		}
+	}
+
+	return 0 // first byte is not a C1 control
+}
+
+// oscLengthC1 returns the length of a C1 OSC body.
+// data is the slice after the C1 OSC initiator (0x9D).
+//
+// Returns:
+//   - n >= 0: consumed body length (includes BEL/ST terminator when present)
+//   - -1: not terminated in the provided data
+//
+// Terminators: BEL (0x07) or C1 ST (0x9C).
+// CAN (0x18) and SUB (0x1A) cancel the control string; they are not counted in the returned length.
+func oscLengthC1[T ~string | ~[]byte](data T) int {
+	for i := 0; i < len(data); i++ {
+		b := data[i]
+		if b == bel || b == st {
+			return i + 1 // the terminator byte is part of the body length
+		}
+		if b == can || b == sub {
+			return i // the cancel byte is excluded from the body length
+		}
+	}
+	return -1 // ran out of data without a terminator or cancel byte
+}
+
+// stSequenceLengthC1 returns the DCS/SOS/PM/APC body length through C1 ST (0x9C) inclusive,
+// the length before a canceling CAN/SUB (cancel byte excluded), or -1 if data ends unterminated.
+func stSequenceLengthC1[T ~string | ~[]byte](data T) int {
+	for i := 0; i < len(data); i++ {
+		b := data[i]
+		if b == can || b == sub {
+			return i // the cancel byte is excluded from the body length
+		}
+		if b == st {
+			return i + 1 // the ST byte is part of the body length
+		}
+	}
+	return -1 // ran out of data without ST or a cancel byte
+}