Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 6 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,8 @@ All notable changes to this project will be documented in this file. This projec

### Fixed

- Grapheme cursor now implements UAX #29 GB9c (Indic_Conjunct_Break, Unicode 15.1). Sequences like Devanagari KA + VIRAMA + TA are now correctly treated as a single cluster. Caught by the new GraphemeBreakTest.txt conformance runner.


### Added

Expand Down Expand Up @@ -34,8 +36,11 @@ All notable changes to this project will be documented in this file. This projec
- `Trim` primitive — `trim`, `trim_start`, `trim_end` over the White_Space property
- `Replace` primitive — `all`, `first`
- Error types: `InvalidUtf8(offset)`, `OutOfRange(index, size)`, `InvalidScalar(value)`
- `IndicConjunctBreak` closed union (`InCBNone` / `InCBConsonant` / `InCBLinker` / `InCBExtend`) and `_UcdIndicConjunctBreak` lookup table; surfaced via `Codepoints.indic_conjunct_break`
- `make conform`: NormalizationTest.txt Part 2 — for every assigned cp not in @Part1 of the test file, verify X == NFC(X) == NFD(X) == NFKC(X) == NFKD(X)
- `make conform-grapheme`: UAX #29 GraphemeBreakTest.txt conformance (1,093 cases including GB9c)
- 146 PonyCheck unit tests
- GitHub Actions CI: pr workflow with lint, changelog verify, and full UAX #15 conformance
- GitHub Actions CI: pr workflow with lint, changelog verify, and full normalization + grapheme conformance

### Changed

14 changes: 12 additions & 2 deletions Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ SOURCE_FILES := $(shell find $(SRC_DIR) -name '*.pony')

all: ci

ci: unit-tests conform
ci: unit-tests conform conform-grapheme

test: unit-tests

Expand Down Expand Up @@ -76,7 +76,7 @@ $(BUILD_DIR):
dependencies: corral.json
$(GET_DEPENDENCIES_WITH)

.PHONY: all ci test unit-tests clean realclean dependencies docs ucd-build ucd-generate ucd-download conform conform-build
.PHONY: all ci test unit-tests clean realclean dependencies docs ucd-build ucd-generate ucd-download conform conform-build conform-grapheme conform-grapheme-build

.DEFAULT_GOAL := all

Expand Down Expand Up @@ -145,3 +145,13 @@ $(conform_binary): $(SOURCE_FILES) $(shell find unicode_conform_main -name '*.po

conform: $(conform_binary)
$(conform_binary) $(UCD_DIR)/NormalizationTest.txt

# Path of the grapheme conformance runner executable inside the build directory.
conform_grapheme_binary := $(BUILD_DIR)/unicode_grapheme_conform_main

# Convenience target: build the grapheme conformance runner without running it.
conform-grapheme-build: $(conform_grapheme_binary)

# Rebuild the runner when any library source or any .pony file under
# unicode_grapheme_conform_main changes (find errors are discarded via
# 2>/dev/null). $(BUILD_DIR) and dependencies are order-only prerequisites,
# so they must exist first but do not by themselves force a rebuild.
$(conform_grapheme_binary): $(SOURCE_FILES) $(shell find unicode_grapheme_conform_main -name '*.pony' 2>/dev/null) | $(BUILD_DIR) dependencies
$(PONYC) -o $(BUILD_DIR) unicode_grapheme_conform_main -b unicode_grapheme_conform_main

# Build the runner if needed, then run it against the UCD
# auxiliary/GraphemeBreakTest.txt data file.
conform-grapheme: $(conform_grapheme_binary)
$(conform_grapheme_binary) $(UCD_DIR)/auxiliary/GraphemeBreakTest.txt
81 changes: 72 additions & 9 deletions unicode/_grapheme_cursor.pony
Original file line number Diff line number Diff line change
Expand Up @@ -15,16 +15,15 @@
// GB9b — Prepend ×
// GB11 — emoji ZWJ sequence: Extended_Pictographic Extend* ZWJ ×
// Extended_Pictographic
// GB9c — Indic_Conjunct_Break (Unicode 15.1):
// InCB=Consonant [InCB=Extend InCB=Linker]* InCB=Linker
// [InCB=Extend InCB=Linker]* × InCB=Consonant
// Suppresses the break before a Consonant when the
// history since the last cluster start contains a
// Consonant followed by at least one Linker, with only
// Extend/Linker codepoints between.
// GB12, GB13 — Regional_Indicator pairs
// GB999 — otherwise: break
//
// Not yet implemented:
//
// GB9c — Indic conjunct breaks (Unicode 15.1+). Requires the
// InCB property table, which lands in a later M1
// generator. Until then, text using ZWJ-linked Indic
// conjuncts (Devanagari, Bengali, etc.) may produce
// extra break points within an intended visual unit.

class ref _GraphemeCursor
let _bytes: String box
Expand All @@ -36,6 +35,8 @@ class ref _GraphemeCursor
// (Extend|ZWJ)* on the current cluster?
var _saw_zwj: Bool // last codepoint in the cluster was ZWJ
// (set after E_Pict Extend*)
var _incb_consonant_seen: Bool // GB9c: seen InCB=Consonant in cluster
var _incb_linker_seen: Bool // GB9c: seen InCB=Linker after consonant
var _emitted_first: Bool // has the first cluster been emitted yet?

new ref create(bytes: String box) =>
Expand All @@ -46,6 +47,8 @@ class ref _GraphemeCursor
_ri_run = 0
_in_emoji_seq = false
_saw_zwj = false
_incb_consonant_seen = false
_incb_linker_seen = false
_emitted_first = false

fun ref next_range(): ((USize, USize) | None) =>
Expand All @@ -67,14 +70,24 @@ class ref _GraphemeCursor
_start = _pos
_pos = _pos + cp0_len
let g0 = _UcdGraphemeBreak.of(cp0)
let incb0 = _UcdIndicConjunctBreak.of(cp0)
_reset_state(g0)
_reset_incb(incb0)

// Extend the cluster as long as GB rules say "no break."
while _pos < size do
(let cp1, let cp1_len) =
try _decode(_pos)? else (U32(0xFFFD), USize(1)) end
let g1 = _UcdGraphemeBreak.of(cp1)
if _break_between(_prev_break, g1) then
let incb1 = _UcdIndicConjunctBreak.of(cp1)
var should_break = _break_between(_prev_break, g1)
// GB9c override: suppress an otherwise-required break when the
// current cluster contains a Consonant + Linker history and the
// next codepoint is also a Consonant.
if should_break and _gb9c_applies(incb1) then
should_break = false
end
if should_break then
// Boundary before cp1 — leave it for the next call.
let range = (_start, _pos)
_prev_break = g1
Expand All @@ -84,6 +97,7 @@ class ref _GraphemeCursor
end
// No break: extend the cluster.
_advance_state(g1)
_advance_incb(incb1)
_pos = _pos + cp1_len
end
// Reached end of input — emit the final cluster.
Expand All @@ -106,6 +120,55 @@ class ref _GraphemeCursor
else 0
end

fun ref _reset_incb(incb: IndicConjunctBreak) =>
  """
  Re-seed the GB9c tracking flags for a cluster whose first codepoint
  has the InCB value `incb`. The linker flag always starts cleared; the
  consonant flag is set only when the cluster opens with a Consonant.
  """
  _incb_linker_seen = false
  _incb_consonant_seen =
    match incb
    | InCBConsonant => true
    else false
    end

fun ref _advance_incb(incb: IndicConjunctBreak) =>
  """
  Fold one more codepoint of the current cluster into the GB9c tracking
  flags, keyed on that codepoint's InCB value:
    Extend    — flags are left untouched
    Linker    — sets the linker flag, but only if a Consonant is on record
    Consonant — consonant flag on, linker flag off (the search restarts)
    None      — both flags off (the chain is broken)
  """
  match incb
  | InCBExtend =>
    None
  | InCBLinker =>
    // Same effect as "if a consonant was seen, mark the linker as seen":
    // without a Consonant on record the flag keeps its current value.
    _incb_linker_seen = _incb_linker_seen or _incb_consonant_seen
  | InCBConsonant =>
    _incb_linker_seen = false
    _incb_consonant_seen = true
  | InCBNone =>
    _incb_linker_seen = false
    _incb_consonant_seen = false
  end

fun _gb9c_applies(next: IndicConjunctBreak): Bool =>
  """
  Report whether GB9c cancels the break in front of `next`. That is the
  case only when `next` is a Consonant and the tracking flags record a
  Consonant followed by at least one Linker in the current cluster.
  Any other value of `next` yields false.
  """
  (next is InCBConsonant) and _incb_consonant_seen and _incb_linker_seen

fun ref _advance_state(g: GraphemeBreak) =>
"""
Update tracking state after extending the current cluster with a
Expand Down
39 changes: 39 additions & 0 deletions unicode/_ucd_indic_conjunct_break.pony
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
// Auto-generated by unicode-build. Do not edit by hand.
// Regenerate via:
// make ucd-generate
//
// Source: DerivedCoreProperties.txt InCB entries.
// Codepoints outside any range default to InCBNone.

primitive _UcdIndicConjunctBreak
fun of(cp: U32): IndicConjunctBreak =>
  """
  Binary-search the range table for `cp` and return the InCB value of
  the range containing it, or InCBNone when no range contains it.

  Each record spans 18 characters of the table string. The values read
  at offsets 0/2/4/6 are combined low-order first into the range start,
  those at 8/10/12/14 into the range end, and the value at offset 16 is
  the property byte. Any failure while reading a record yields InCBNone.
  """
  let table = _table()
  let record_width: USize = 18
  var first: USize = 0
  var limit: USize = table.size() / record_width
  try
    while first < limit do
      let probe = first + ((limit - first) / 2)
      let at = probe * record_width
      let lower: U32 =
        _UcdHex.byte(table, at)?
          or (_UcdHex.byte(table, at + 2)? << 8)
          or (_UcdHex.byte(table, at + 4)? << 16)
          or (_UcdHex.byte(table, at + 6)? << 24)
      let upper: U32 =
        _UcdHex.byte(table, at + 8)?
          or (_UcdHex.byte(table, at + 10)? << 8)
          or (_UcdHex.byte(table, at + 12)? << 16)
          or (_UcdHex.byte(table, at + 14)? << 24)
      if cp < lower then
        // Target lies below this range: keep the left half.
        limit = probe
      elseif cp > upper then
        // Target lies above this range: keep the right half.
        first = probe + 1
      else
        let tag = U8.from[U32](_UcdHex.byte(table, at + 16)?)
        return IndicConjunctBreaks._from_byte(tag)
      end
    end
  end
  // Reached when the search is exhausted or a record could not be read.
  InCBNone

fun _table(): String val =>
"000300006F0300000383040000890400000391050000BD05000003BF050000BF05000003C1050000C205000003C4050000C505000003C7050000C705000003100600001A060000034B0600005F06000003700600007006000003D6060000DC06000003DF060000E406000003E7060000E806000003EA060000ED06000003110700001107000003300700004A07000003A6070000B007000003EB070000F307000003FD070000FD070000031608000019080000031B0800002308000003250800002708000003290800002D08000003590800005B08000003970800009F08000003CA080000E108000003E308000002090000031509000039090000013A0900003A090000033C0900003C090000034109000048090000034D0900004D09000002510900005709000003580900005F09000001620900006309000003780900007F0900000181090000810900000395090000A809000001AA090000B009000001B2090000B209000001B6090000B909000001BC090000BC09000003BE090000BE09000003C1090000C409000003CD090000CD09000002D7090000D709000003DC090000DD09000001DF090000DF09000001E2090000E309000003F0090000F109000001FE090000FE09000003010A0000020A0000033C0A00003C0A000003410A0000420A000003470A0000480A0000034B0A00004D0A000003510A0000510A000003700A0000710A000003750A0000750A000003810A0000820A000003950A0000A80A000001AA0A0000B00A000001B20A0000B30A000001B50A0000B90A000001BC0A0000BC0A000003C10A0000C50A000003C70A0000C80A000003CD0A0000CD0A000002E20A0000E30A000003F90A0000F90A000001FA0A0000FF0A000003010B0000010B000003150B0000280B0000012A0B0000300B000001320B0000330B000001350B0000390B0000013C0B00003C0B0000033E0B00003F0B000003410B0000440B0000034D0B00004D0B000002550B0000570B0000035C0B00005D0B0000015F0B00005F0B000001620B0000630B000003710B0000710B000001820B0000820B000003BE0B0000BE0B000003C00B0000C00B000003CD0B0000CD0B000003D70B0000D70B000003000C0000000C000003040C0000040C000003150C0000280C0000012A0C0000390C0000013C0C00003C0C0000033E0C0000400C000003460C0000480C0000034A0C00004C0C0000034D0C00004D0C000002550C0000560C000003580C00005A0C000001620C0000630C000003810C0000810C000003BC0C0000BC0C000003BF0C0000C00C000003C20C0000C20C000003C60C0000C80C000003CA0C0000CD0C000003D50C0000D60C000003E20C0000E30C000003000D0000010D0000031
50D00003A0D0000013B0D00003C0D0000033E0D00003E0D000003410D0000440D0000034D0D00004D0D000002570D0000570D000003620D0000630D000003810D0000810D000003CA0D0000CA0D000003CF0D0000CF0D000003D20D0000D40D000003D60D0000D60D000003DF0D0000DF0D000003310E0000310E000003340E00003A0E000003470E00004E0E000003B10E0000B10E000003B40E0000BC0E000003C80E0000CE0E000003180F0000190F000003350F0000350F000003370F0000370F000003390F0000390F000003710F00007E0F000003800F0000840F000003860F0000870F0000038D0F0000970F000003990F0000BC0F000003C60F0000C60F0000032D1000003010000003321000003710000003391000003A100000033D1000003E100000035810000059100000035E10000060100000037110000074100000038210000082100000038510000086100000038D1000008D100000039D1000009D100000035D1300005F13000003121700001517000003321700003417000003521700005317000003721700007317000003B4170000B517000003B7170000BD17000003C6170000C617000003C9170000D317000003DD170000DD170000030B1800000D180000030F1800000F18000003851800008618000003A9180000A918000003201900002219000003271900002819000003321900003219000003391900003B19000003171A0000181A0000031B1A00001B1A000003561A0000561A000003581A00005E1A000003601A0000601A000003621A0000621A000003651A00006C1A000003731A00007C1A0000037F1A00007F1A000003B01A0000CE1A000003001B0000031B000003341B00003D1B000003421B0000441B0000036B1B0000731B000003801B0000811B000003A21B0000A51B000003A81B0000AD1B000003E61B0000E61B000003E81B0000E91B000003ED1B0000ED1B000003EF1B0000F31B0000032C1C0000331C000003361C0000371C000003D01C0000D21C000003D41C0000E01C000003E21C0000E81C000003ED1C0000ED1C000003F41C0000F41C000003F81C0000F91C000003C01D0000FF1D0000030D2000000D20000003D0200000F020000003EF2C0000F12C0000037F2D00007F2D000003E02D0000FF2D0000032A3000002F30000003993000009A300000036FA6000072A600000374A600007DA60000039EA600009FA6000003F0A60000F1A600000302A8000002A800000306A8000006A80000030BA800000BA800000325A8000026A80000032CA800002CA8000003C4A80000C5A8000003E0A80000F1A8000003FFA80000FFA800000326A900002DA900000347A9000051A900000353A9000053A900000380A9000082A9000003B3A
90000B3A9000003B6A90000B9A9000003BCA90000BDA9000003C0A90000C0A9000003E5A90000E5A900000329AA00002EAA00000331AA000032AA00000335AA000036AA00000343AA000043AA0000034CAA00004CAA0000037CAA00007CAA000003B0AA0000B0AA000003B2AA0000B4AA000003B7AA0000B8AA000003BEAA0000BFAA000003C1AA0000C1AA000003ECAA0000EDAA000003F6AA0000F6AA000003E5AB0000E5AB000003E8AB0000E8AB000003EDAB0000EDAB0000031EFB00001EFB00000300FE00000FFE00000320FE00002FFE0000039EFF00009FFF000003FD010100FD01010003E0020100E002010003760301007A03010003010A0100030A010003050A0100060A0100030C0A01000F0A010003380A01003A0A0100033F0A01003F0A010003E50A0100E60A010003240D0100270D010003690D01006D0D010003AB0E0100AC0E010003FC0E0100FF0E010003460F0100500F010003820F0100850F0100030110010001100100033810010046100100037010010070100100037310010074100100037F1001008110010003B3100100B610010003B9100100BA10010003C2100100C210010003001101000211010003271101002B110100032D1101003411010003731101007311010003801101008111010003B6110100BE11010003C0110100C011010003C9110100CC11010003CF110100CF110100032F12010031120100033412010037120100033E1201003E12010003411201004112010003DF120100DF12010003E3120100EA120100030013010001130100033B1301003C130100033E1301003E130100034013010040130100034D1301004D13010003571301005713010003661301006C13010003701301007413010003B8130100B813010003BB130100C013010003C2130100C213010003C5130100C513010003C7130100C913010003CE130100D013010003D2130100D213010003E1130100E213010003381401003F140100034214010044140100034614010046140100035E1401005E14010003B0140100B014010003B3140100B814010003BA140100BA14010003BD140100BD14010003BF140100C014010003C2140100C314010003AF150100AF15010003B2150100B515010003BC150100BD15010003BF150100C015010003DC150100DD15010003331601003A160100033D1601003D160100033F1601004016010003AB160100AB16010003AD160100AD16010003B0160100B7160100031D1701001D170100031F1701001F17010003221701002517010003271701002B170100032F1801003718010003391801003A180100033019010030190100033B1901003E19010003431901004319010003D4190100D719010003DA190100DB19010003E0190
100E019010003011A01000A1A010003331A0100381A0100033B1A01003E1A010003471A0100471A010003511A0100561A010003591A01005B1A0100038A1A0100961A010003981A0100991A010003301C0100361C010003381C01003D1C0100033F1C01003F1C010003921C0100A71C010003AA1C0100B01C010003B21C0100B31C010003B51C0100B61C010003311D0100361D0100033A1D01003A1D0100033C1D01003D1D0100033F1D0100451D010003471D0100471D010003901D0100911D010003951D0100951D010003971D0100971D010003F31E0100F41E010003001F0100011F010003361F01003A1F010003401F0100421F0100035A1F01005A1F0100034034010040340100034734010055340100031E61010029610100032D6101002F61010003F06A0100F46A010003306B0100366B0100034F6F01004F6F0100038F6F0100926F010003E46F0100E46F010003F06F0100F16F0100039DBC01009EBC01000300CF01002DCF01000330CF010046CF01000365D1010069D10100036DD1010072D10100037BD1010082D101000385D101008BD1010003AAD10100ADD101000342D2010044D201000300DA010036DA0100033BDA01006CDA01000375DA010075DA01000384DA010084DA0100039BDA01009FDA010003A1DA0100AFDA01000300E0010006E001000308E0010018E00100031BE0010021E001000323E0010024E001000326E001002AE00100038FE001008FE001000330E1010036E1010003AEE20100AEE2010003ECE20100EFE2010003ECE40100EFE4010003EEE50100EFE5010003D0E80100D6E801000344E901004AE9010003FBF30100FFF301000320000E007F000E000300010E00EF010E0003"
61 changes: 61 additions & 0 deletions unicode/indic_conjunct_break.pony
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
// `Indic_Conjunct_Break` property values (UAX #44, added Unicode 15.1).
//
// Used by UAX #29 rule GB9c to suppress cluster breaks within Indic
// conjuncts of the form:
//
// Consonant [Extend|Linker]* Linker [Extend|Linker]* × Consonant
//
// The four values match DerivedCoreProperties.txt's `InCB` property:
//
// None — default; codepoint isn't involved in Indic conjunct
// linking.
// Consonant — a Linking Consonant (Devanagari KA, Bengali KA, ...).
// Linker — a Conjunct Linker (DEVANAGARI SIGN VIRAMA, etc.).
// Extend — Extend-class cp valid inside a conjunct (NUKTA, ZWJ).
//
// Like the other closed unions in this package, exhaustive `match`
// over `IndicConjunctBreak` is compile-checked.

primitive InCBNone
  """
  `InCB=None`, the default value: the codepoint isn't involved in Indic
  conjunct linking.
  """
  fun string(): String val => code()
  fun code(): String val => "None"

primitive InCBConsonant
  """
  `InCB=Consonant`: a Linking Consonant (Devanagari KA, Bengali KA, ...).
  """
  fun string(): String val => code()
  fun code(): String val => "Consonant"

primitive InCBLinker
  """
  `InCB=Linker`: a Conjunct Linker (DEVANAGARI SIGN VIRAMA, etc.).
  """
  fun string(): String val => code()
  fun code(): String val => "Linker"

primitive InCBExtend
  """
  `InCB=Extend`: an Extend-class codepoint valid inside a conjunct
  (NUKTA, ZWJ).
  """
  fun string(): String val => code()
  fun code(): String val => "Extend"

// Closed union of the four `Indic_Conjunct_Break` property values.
type IndicConjunctBreak is
  ( InCBNone
  | InCBConsonant
  | InCBLinker
  | InCBExtend )

primitive IndicConjunctBreaks
  """
  Conversions between `IndicConjunctBreak` values and their textual and
  single-byte encodings.
  """

  fun from_iso(s: String box): (IndicConjunctBreak | None) =>
    """
    Map a property-value name ("None", "Consonant", "Linker", "Extend")
    to its union member. Any other string yields `None`.
    """
    if s == "Consonant" then InCBConsonant
    elseif s == "Linker" then InCBLinker
    elseif s == "Extend" then InCBExtend
    elseif s == "None" then InCBNone
    else None
    end

  fun _to_byte(b: IndicConjunctBreak): U8 =>
    """
    Encode a value as its byte tag: None=0, Consonant=1, Linker=2,
    Extend=3. The match is exhaustive over the union, so no fallback arm
    is needed.
    """
    match b
    | InCBExtend => 3
    | InCBLinker => 2
    | InCBConsonant => 1
    | InCBNone => 0
    end

  fun _from_byte(b: U8): IndicConjunctBreak =>
    """
    Decode a byte tag (1=Consonant, 2=Linker, 3=Extend). Every other
    byte, including 0, decodes to InCBNone.
    """
    if b == 3 then InCBExtend
    elseif b == 2 then InCBLinker
    elseif b == 1 then InCBConsonant
    else InCBNone
    end
Loading
Loading