From ba42df687a86114d5dfaee43514d83016c7befc3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ju=CC=88rg=20Lehni?=
Date: Mon, 15 Jun 2026 22:21:12 +0200
Subject: [PATCH] Compose canonical decompositions (NFC) in the default shaper

- Compose each base + combining-mark cluster into the font's precomposed
  glyph via font-aware NFC before GSUB, matching HarfBuzz: decomposed
  input (i + U+0300, or Arabic alef + fathatan + hamza-above) shapes to
  the precomposed glyph instead of separate marks
- Only apply the result when it changes the glyph count (composition or
  its decompose fallback); leave pure canonical reordering alone so it
  can't disturb downstream GSUB (e.g. Arabic shadda + vowel calt)
- Decompose back any composed codepoint the font has no glyph for, so its
  marks stay available for GPOS mark positioning
- Scope to the default shaper and the Arabic/Hebrew/Thai shapers that
  inherit it; Indic/Hangul/Universal keep their own composition
- Add 6 canonical-composition shaping tests (FiraSans Latin, Amiri Arabic
  incl. the reorder regression)
---
 src/opentype/shapers/DefaultShaper.js | 73 +++++++++++++++++++++++++++++++++++++++++++++++++-
 test/shaping.js                       | 50 ++++++++++++++++++++++++++++++++++
 2 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/src/opentype/shapers/DefaultShaper.js b/src/opentype/shapers/DefaultShaper.js
index e02e4d36..19652604 100644
--- a/src/opentype/shapers/DefaultShaper.js
+++ b/src/opentype/shapers/DefaultShaper.js
@@ -1,4 +1,5 @@
-import {isDigit} from 'unicode-properties';
+import {isDigit, isMark} from 'unicode-properties';
+import GlyphInfo from '../GlyphInfo';

 const VARIATION_FEATURES = ['rvrn'];
 const COMMON_FEATURES = ['ccmp', 'locl', 'rlig', 'mark', 'mkmk'];
@@ -42,6 +43,13 @@ export default class DefaultShaper {
   }

   static assignFeatures(plan, glyphs) {
+    // Apply Unicode canonical composition (NFC) before GSUB, matching what
+    // HarfBuzz does for non-complex scripts: when the font has a precomposed
+    // glyph for a base + combining-mark sequence, use it. Decomposed input
+    // (e.g. "i" + U+0300) otherwise shapes as separate glyphs ([i, gravecomb])
+    // instead of the precomposed glyph the font intends ([igrave]).
+    composeGlyphs(plan.font, glyphs);
+
     // Enable contextual fractions
     for (let i = 0; i < glyphs.length; i++) {
       let glyph = glyphs[i];
@@ -70,3 +78,66 @@ export default class DefaultShaper {
     }
   }
 }
+
+// Apply Unicode canonical composition (NFC) to each base + combining-mark
+// cluster, mirroring HarfBuzz's normalization before GSUB/GPOS: the marks are
+// reordered by combining class and composed onto the base when the font has a
+// precomposed glyph for the result. Without this, decomposed input shapes as
+// separate glyphs (e.g. "i" + U+0300 -> [i, gravecomb], or Arabic alef +
+// fathatan + hamza-above -> [alef, fathatan, hamza]) instead of the precomposed
+// glyph ([igrave]; [alef-with-hamza, fathatan]) HarfBuzz and browsers produce.
+function composeGlyphs(font, glyphs) {
+  let i = 0;
+  while (i < glyphs.length) {
+    // A cluster starts at a non-mark base glyph...
+    if (glyphs[i].codePoints.length !== 1 || isMark(glyphs[i].codePoints[0])) {
+      i++;
+      continue;
+    }
+    // ...and extends across the combining marks that follow it.
+    let end = i + 1;
+    while (
+      end < glyphs.length &&
+      glyphs[end].codePoints.length === 1 &&
+      isMark(glyphs[end].codePoints[0])
+    ) {
+      end++;
+    }
+    let input = [];
+    for (let j = i; j < end; j++) input.push(glyphs[j].codePoints[0]);
+    let composed = composeCodePoints(font, input);
+    // Only rebuild when composition (or its decompose fallback) actually changed
+    // the glyph count. NFC also canonically reorders marks, but applying a pure
+    // reorder isn't needed to reach a precomposed glyph and would disturb the
+    // order downstream GSUB expects (e.g. Arabic shadda + vowel calt lookups).
+    if (composed.length !== input.length) {
+      // The base's features are global at this stage, so the rebuilt cluster
+      // (precomposed base + any leftover marks) inherits them uniformly.
+      let features = glyphs[i].features;
+      let replacement = composed.map(
+        cp => new GlyphInfo(font, font.glyphForCodePoint(cp).id, [cp], features)
+      );
+      glyphs.splice(i, end - i, ...replacement);
+      i += replacement.length;
+    } else {
+      i = end;
+    }
+  }
+}
+
+// Font-aware Unicode canonical composition for one cluster's codepoints: NFC
+// reorders the combining marks by combining class and composes them, then any
+// resulting codepoint the font can't render is decomposed again (NFD) so its
+// marks stay separate for GPOS mark positioning — exactly HarfBuzz's behaviour.
+function composeCodePoints(font, codePoints) {
+  let result = [];
+  for (let char of String.fromCodePoint(...codePoints).normalize('NFC')) {
+    let cp = char.codePointAt(0);
+    if (font.hasGlyphForCodePoint(cp)) {
+      result.push(cp);
+    } else { // NOTE(review): NFD decomposes fully, even parts the input supplied precomposed (U+00EA + U+0301 -> e + U+0302 + U+0301) — confirm intended.
+      for (let part of char.normalize('NFD')) result.push(part.codePointAt(0));
+    }
+  }
+  return result;
+}
diff --git a/test/shaping.js b/test/shaping.js
index dc005ea0..70e871f2 100644
--- a/test/shaping.js
+++ b/test/shaping.js
@@ -582,4 +582,54 @@ describe('shaping', function () {
       test('SHBALI-2/12', 'NotoSans/NotoSansBalinese-Regular.ttf', "ᬓ᭄ᭅᬸ", '23+2275|162+0|60@0,-1000+0');
     });
   });
+
+  describe('canonical composition (NFC)', function () {
+    // HarfBuzz composes a base + combining-mark sequence into the font's
+    // precomposed glyph before GSUB/GPOS when the font has one. The default
+    // shaper does the same: decomposed input must shape identically to the
+    // precomposed character, not as a separate base + floating mark.
+    let font = fontkit.openSync(new URL('data/FiraSans/FiraSans-Regular.ttf', import.meta.url));
+    let shape = (...cps) => font.layout(String.fromCodePoint(...cps)).glyphs.map(g => g.id);
+
+    it('composes base + combining mark into the precomposed glyph', function () {
+      // "i" + U+0300 (combining grave) === precomposed U+00EC "ì" (igrave)
+      assert.deepEqual(shape(0x69, 0x300), shape(0x00EC));
+    });
+
+    it('composes a multi-mark sequence greedily', function () {
+      // "e" + U+0302 (circumflex) + U+0301 (acute) === precomposed U+1EBF "ế"
+      assert.deepEqual(shape(0x65, 0x302, 0x301), shape(0x1EBF));
+    });
+
+    it('leaves a sequence decomposed when the font has no precomposed glyph', function () {
+      // No precomposed "b-grave" exists, so the mark must stay separate.
+      assert.equal(shape(0x62, 0x300).length, 2);
+    });
+
+    it('does not interfere with GSUB ligatures', function () {
+      // "office": the fi ligature must still form (composition is mark-only).
+      assert.ok(shape(0x6F, 0x66, 0x66, 0x69, 0x63, 0x65).length < 6);
+    });
+
+    it('reorders and composes Arabic marks across combining classes', function () {
+      let amiri = fontkit.openSync(new URL('data/amiri/amiri-regular.ttf', import.meta.url));
+      let shapeAr = (...cps) => amiri.layout(String.fromCodePoint(...cps)).glyphs.map(g => g.id);
+      // alef + combining hamza-above === precomposed U+0623 (alef with hamza).
+      assert.deepEqual(shapeAr(0x627, 0x654), shapeAr(0x623));
+      // alef + fathatan + hamza-above: the hamza (ccc 230) composes onto the
+      // alef across the lower-class fathatan (ccc 27) per canonical reordering,
+      // leaving the fathatan — i.e. identical to precomposed U+0623 + fathatan.
+      assert.deepEqual(shapeAr(0x627, 0x64b, 0x654), shapeAr(0x623, 0x64b));
+    });
+
+    it('leaves a non-composing mark cluster unreordered for GSUB', function () {
+      // alef + shadda + fathatan don't compose, so the cluster must stay in its
+      // original order: canonically reordering shadda (ccc 33) after fathatan
+      // (ccc 27) would break Amiri's calt, which keys on shadda-before-vowel and
+      // yields fathatan's small variant. Applying a pure NFC reorder regresses it.
+      let amiri = fontkit.openSync(new URL('data/amiri/amiri-regular.ttf', import.meta.url));
+      let { glyphs } = amiri.layout(String.fromCodePoint(0x627, 0x651, 0x64b), { calt: true }, undefined, 'ARA ');
+      assert.deepEqual(glyphs.map(g => g.name), ['uni064B.small', 'uni0651', 'uni0627']);
+    });
+  });
 });