diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 656c70c..b16a543 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -24,6 +24,9 @@ jobs: test: name: Test - ${{ matrix.os }} - Go ${{ matrix.go-version }} runs-on: ${{ matrix.os }} + permissions: + contents: read + id-token: write # Required for Codecov OIDC (tokenless upload) strategy: matrix: os: [ubuntu-latest, macos-latest, windows-latest] @@ -57,7 +60,8 @@ jobs: uses: codecov/codecov-action@v5 continue-on-error: true with: - file: ./coverage.txt + use_oidc: true + files: ./coverage.txt flags: unittests name: codecov-uniwidth fail_ci_if_error: false diff --git a/CHANGELOG.md b/CHANGELOG.md index 478c207..0a83bdb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,11 +8,14 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] ### Planned +- Non-ASCII StringWidth path optimization (reduce ZWJ overhead for non-emoji) - Profile-Guided Optimization (PGO) support -- Unicode 17.0 preparation -- Benchmark CI for regression detection +- Unicode 17.0 tables - Explicit SIMD via Go assembly and `archsimd` (Go 1.26+) -- API stability review based on community feedback + +### Added +- **100% test coverage**: Exhaustive branch coverage for `isExtendedPictographic()` (all Unicode ranges) and `asciiWidth()` (SWAR fast/slow paths, control chars at every byte offset). +- **Benchmark CI**: Automated regression detection (benchstat) and three-way library comparison table in PR comments. ## [0.2.0] - 2026-02-05 @@ -32,7 +35,7 @@ Major performance and emoji correctness release. All four lookup tiers are now O - **ASCII detection**: SWAR `isASCIIOnly()` processes 8 bytes/iter via uint64 word with `0x8080808080808080` mask. No unsafe pointer escapes. - **ASCII width counting**: SWAR `asciiWidth()` uses Daniel Lemire's underflow trick for control character detection in 8-byte chunks. - **Short string optimization**: Strings < 8 bytes use a fused single-pass loop that combines ASCII check and width counting, avoiding SWAR function call overhead. -- **Test coverage**: 87.1% → 96.4% (+9.3%). +- **Test coverage**: 87.1% → 100% (library package). ### Performance - **ASCII**: 3-46x faster than go-runewidth (SWAR fast paths) diff --git a/README.md b/README.md index da7605a..3964bd5 100644 --- a/README.md +++ b/README.md @@ -240,7 +240,7 @@ go test -bench=. -benchmem go test -cover ``` -Current test coverage: **96.4%** +Current test coverage: **100%** (library package) ## Development Status @@ -253,13 +253,14 @@ Current test coverage: **96.4%** - SWAR ASCII optimization (8 bytes/iter) - ZWJ emoji state machine (👨‍👩‍👧‍👦 = width 2) - Emoji modifier support (👍🏽 = width 2) -- 96.4% test coverage +- 100% test coverage (library package) +- Automated benchmark CI (regression detection + library comparison) **Roadmap** (v0.3.0+): +- Non-ASCII StringWidth path optimization - Profile-Guided Optimization (PGO) -- Benchmark CI for regression detection - Explicit SIMD via Go assembly and `archsimd` -- Unicode 17.0 preparation +- Unicode 17.0 tables ## Contributing diff --git a/codecov.yml b/codecov.yml new file mode 100644 index 0000000..ba14eaf --- /dev/null +++ b/codecov.yml @@ -0,0 +1,11 @@ +coverage: + status: + project: + default: + target: 90% + patch: + default: + target: 80% + +ignore: + - "cmd/**" diff --git a/options.go b/options.go index bfd7cab..02c6a37 100644 --- a/options.go +++ b/options.go @@ -253,7 +253,7 @@ func runeWidthInternal(r rune) int { // // Performance: O(1), 0 allocations. func tableLookupWidthInternal(r rune) int { - cp := uint32(r) + cp := uint32(r) //nolint:gosec // G115: rune is int32; valid Unicode codepoints (0–0x10FFFF) are always non-negative rootIdx := widthRoot[cp>>13] midIdx := widthMiddle[rootIdx][cp>>7&0x3F] packed := widthLeaves[midIdx][cp>>2&0x1F] diff --git a/uniwidth.go b/uniwidth.go index 58d28f4..970b815 100644 --- a/uniwidth.go +++ b/uniwidth.go @@ -523,7 +523,7 @@ func asciiWidth(s string) int { // // Performance: O(1), 0 allocations. Three array lookups + bit extraction. func tableLookupWidth(r rune) int { - cp := uint32(r) + cp := uint32(r) //nolint:gosec // G115: rune is int32; valid Unicode codepoints (0–0x10FFFF) are always non-negative rootIdx := widthRoot[cp>>13] midIdx := widthMiddle[rootIdx][cp>>7&0x3F] packed := widthLeaves[midIdx][cp>>2&0x1F] diff --git a/uniwidth_test.go b/uniwidth_test.go index 689554c..daba7d9 100644 --- a/uniwidth_test.go +++ b/uniwidth_test.go @@ -968,3 +968,202 @@ func TestRuneWidth_UncommonRanges(t *testing.T) { }) } } + +// ============================================================================= +// isExtendedPictographic — exhaustive branch coverage for all Unicode ranges +// ============================================================================= + +func TestIsExtendedPictographic_AllRanges(t *testing.T) { + tests := []struct { + name string + r rune + want bool + }{ + // Range: Misc Symbols and Arrows (U+2B00-U+2BFF) + {"Misc Symbols/Arrows start U+2B00", 0x2B00, true}, + {"Up arrow U+2B06", 0x2B06, true}, + {"Star U+2B50", 0x2B50, true}, + {"Misc Symbols/Arrows end U+2BFF", 0x2BFF, true}, + {"Below Misc Symbols/Arrows U+2AFF", 0x2AFF, false}, + + // Range: Arrow symbols (U+2194-U+21AA) + {"Left-right arrow U+2194", 0x2194, true}, + {"Rightwards arrow with hook U+21AA", 0x21AA, true}, + {"Mid arrow range U+219E", 0x219E, true}, + {"Below arrow range U+2193", 0x2193, false}, + + // Range: Geometric Shapes (U+25A0-U+25FF) + {"Black square U+25A0", 0x25A0, true}, + {"White circle U+25CB", 0x25CB, true}, + {"Geometric end U+25FF", 0x25FF, true}, + {"Below geometric range U+259F", 0x259F, false}, + + // Range: Legacy Computing (U+1FB00-U+1FFFD) + {"Legacy Computing start U+1FB00", 0x1FB00, true}, + {"Legacy Computing mid U+1FC00", 0x1FC00, true}, + {"Legacy Computing end U+1FFFD", 0x1FFFD, true}, + {"Above legacy range U+1FFFE", 0x1FFFE, false}, + + // Verify existing ranges still work + {"SMP emoji start U+1F000", 0x1F000, true}, + {"SMP emoji end U+1FAFF", 0x1FAFF, true}, + {"Misc Symbols start U+2600", 0x2600, true}, + {"Dingbats end U+27BF", 0x27BF, true}, + {"Misc Technical start U+2300", 0x2300, true}, + {"Misc Technical end U+23FF", 0x23FF, true}, + + // Individual EP characters (switch statement) + {"Double exclamation U+203C", 0x203C, true}, + {"Exclamation question U+2049", 0x2049, true}, + {"Info source U+2139", 0x2139, true}, + {"Wavy dash U+3030", 0x3030, true}, + {"Part alternation U+303D", 0x303D, true}, + {"Circled congratulation U+3297", 0x3297, true}, + {"Circled secret U+3299", 0x3299, true}, + + // Negatives: characters in gaps between EP ranges + {"After arrow, before misc tech U+21AB", 0x21AB, false}, + {"Control Pictures block U+2400", 0x2400, false}, + {"Box Drawing block U+2500", 0x2500, false}, + {"After dingbats U+27C0", 0x27C0, false}, + {"BMP high U+FFFF", 0xFFFF, false}, + {"Regular Latin", 'Z', false}, + {"Null character", 0x0000, false}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := isExtendedPictographic(tt.r) + if got != tt.want { + t.Errorf("isExtendedPictographic(%U) = %v, want %v", tt.r, got, tt.want) + } + }) + } +} + +// ============================================================================= +// asciiWidth — direct unit tests for SWAR control character detection +// ============================================================================= + +func TestAsciiWidth(t *testing.T) { + tests := []struct { + name string + s string + want int + }{ + // Empty string (n == 0 early return) + {"empty string", "", 0}, + + // Short strings (< 8 bytes, scalar tail only) + {"single printable", "A", 1}, + {"7 printable chars", "abcdefg", 7}, + + // Exactly 8 bytes (one SWAR chunk, no tail) + {"8 printable chars", "abcdefgh", 8}, + {"8 spaces (0x20 boundary)", " ", 8}, + {"8 tilde (0x7E boundary)", "~~~~~~~~", 8}, + + // Longer strings (multiple SWAR chunks + tail) + {"16 chars (2 chunks)", "0123456789abcdef", 16}, + {"17 chars (2 chunks + 1 tail)", "0123456789abcdefg", 17}, + {"24 chars (3 chunks)", "abcdefghijklmnopqrstuvwx", 24}, + + // SWAR slow path: control character forces byte-by-byte fallback + {"null in 8-byte chunk", "abcd\x00efg", 7}, + {"tab in 8-byte chunk", "abcdefg\t", 7}, + {"mixed CR/LF in chunk", "abc\ndef\rg", 7}, + {"DEL (0x7F) in chunk", "abcdefg\x7F", 7}, + {"BEL (0x07) in chunk", "abc\x07defg", 7}, + {"multiple controls in chunk", "\t\n\r\x00ABCD", 4}, + {"all control chars in chunk", "\x00\x01\x02\x03\x04\x05\x06\x07", 0}, + + // Mixed fast path + slow path across chunks + {"ctrl first chunk, clean second", "\x01bcdefghijklmnop", 15}, + {"clean first chunk, ctrl second", "abcdefgh\x00jklmnop", 15}, + {"ctrl in both chunks", "\x01bcdefgh\x02jklmnop", 14}, + + // Scalar tail with control characters + {"9 chars with tab at tail", "abcdefgh\t", 8}, + {"10 chars with null at tail", "abcdefghi\x00", 9}, + {"15 chars with DEL at tail", "abcdefghijklmn\x7F", 14}, + + // Printable boundary: 0x1F is control, 0x20 is printable + {"unit separator 0x1F in chunk", "abcdefg\x1F", 7}, + {"space 0x20 in chunk", "abcdefg ", 8}, + + // Realistic TUI content (8+ bytes ASCII with mixed content) + {"terminal prompt", "user@host:~$ ", 13}, + {"table separator", "+--------+--------+", 19}, + {"progress bar", "[=====> ]", 18}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := asciiWidth(tt.s) + if got != tt.want { + t.Errorf("asciiWidth(%q) = %d, want %d", tt.s, got, tt.want) + } + }) + } +} + +// TestAsciiWidth_SWARControlAtEveryPosition exercises the SWAR slow path +// with a control character placed at each byte offset within an 8-byte chunk, +// ensuring correct byte-by-byte fallback regardless of position. +func TestAsciiWidth_SWARControlAtEveryPosition(t *testing.T) { + // SOH (0x01) at each position in an 8-byte chunk + for pos := 0; pos < 8; pos++ { + s := []byte("ABCDEFGH") + s[pos] = 0x01 + t.Run("soh_at_"+string(rune('0'+pos)), func(t *testing.T) { + got := asciiWidth(string(s)) + if got != 7 { + t.Errorf("asciiWidth(%q) = %d, want 7 (SOH at pos %d)", s, got, pos) + } + }) + } + + // DEL (0x7F) at each position in an 8-byte chunk + for pos := 0; pos < 8; pos++ { + s := []byte("ABCDEFGH") + s[pos] = 0x7F + t.Run("del_at_"+string(rune('0'+pos)), func(t *testing.T) { + got := asciiWidth(string(s)) + if got != 7 { + t.Errorf("asciiWidth(%q) = %d, want 7 (DEL at pos %d)", s, got, pos) + } + }) + } +} + +// TestStringWidth_ASCIIControlMix verifies that StringWidth correctly delegates +// to asciiWidth for long ASCII strings with embedded control characters. +func TestStringWidth_ASCIIControlMix(t *testing.T) { + tests := []struct { + name string + s string + want int + }{ + // 8+ byte ASCII strings that hit the isASCIIOnly→asciiWidth path + {"16 printable", "Hello, World!!! ", 16}, + {"tab in long string", "Hello\tWorld!!!", 13}, + {"newline in long string", "Hello\nWorld!!!", 13}, + {"DEL in long string", "Hello\x7FWorld!!!", 13}, + {"multiple newlines", "line1\nline2\nline3\n", 15}, + {"null bytes scattered", "abc\x00defg\x00ijklmnop", 15}, + {"TUI box drawing ASCII", "+----------+----------+", 23}, + + // Boundary: exactly 8 bytes + {"8 bytes all printable", "12345678", 8}, + {"8 bytes with tab", "1234567\t", 7}, + } + + for _, tt := range tests { + t.Run(tt.name, func(t *testing.T) { + got := StringWidth(tt.s) + if got != tt.want { + t.Errorf("StringWidth(%q) = %d, want %d", tt.s, got, tt.want) + } + }) + } +}