From ba42df687a86114d5dfaee43514d83016c7befc3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Ju=CC=88rg=20Lehni?= <juerg@scratchdisk.com>
Date: Mon, 15 Jun 2026 22:21:12 +0200
Subject: [PATCH] Compose canonical decompositions (NFC) in the default shaper

- Compose each base + combining-mark cluster into the font's precomposed glyph via font-aware NFC before GSUB, matching HarfBuzz: decomposed input (i + U+0300, or Arabic alef + fathatan + hamza-above) shapes to the precomposed glyph instead of separate marks
- Only apply the result when it changes the glyph count (composition or its decompose fallback); leave pure canonical reordering alone so it can't disturb downstream GSUB (e.g. Arabic shadda + vowel calt)
- Decompose back any composed codepoint the font has no glyph for, so its marks stay available for GPOS mark positioning
- Scope to the default shaper and the Arabic/Hebrew/Thai shapers that inherit it; Indic/Hangul/Universal keep their own composition
- Add 6 canonical-composition shaping tests (FiraSans Latin, Amiri Arabic incl. the reorder regression)
---
 src/opentype/shapers/DefaultShaper.js | 73 ++++++++++++++++++++++++++-
 test/shaping.js                       | 50 ++++++++++++++++++
 2 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/src/opentype/shapers/DefaultShaper.js b/src/opentype/shapers/DefaultShaper.js
index e02e4d36..19652604 100644
--- a/src/opentype/shapers/DefaultShaper.js
+++ b/src/opentype/shapers/DefaultShaper.js
@@ -1,4 +1,5 @@
-import {isDigit} from 'unicode-properties';
+import {isDigit, isMark} from 'unicode-properties';
+import GlyphInfo from '../GlyphInfo';
 
 const VARIATION_FEATURES = ['rvrn'];
 const COMMON_FEATURES = ['ccmp', 'locl', 'rlig', 'mark', 'mkmk'];
@@ -42,6 +43,13 @@ export default class DefaultShaper {
   }
 
   static assignFeatures(plan, glyphs) {
+    // Apply Unicode canonical composition (NFC) before GSUB, matching what
+    // HarfBuzz does for non-complex scripts: when the font has a precomposed
+    // glyph for a base + combining-mark sequence, use it. Decomposed input
+    // (e.g. "i" + U+0300) otherwise shapes as separate glyphs ([i, gravecomb])
+    // instead of the precomposed glyph the font intends ([igrave]).
+    composeGlyphs(plan.font, glyphs);
+
     // Enable contextual fractions
     for (let i = 0; i < glyphs.length; i++) {
       let glyph = glyphs[i];
@@ -70,3 +78,66 @@ export default class DefaultShaper {
     }
   }
 }
+
+// Apply font-aware Unicode canonical composition (NFC) to each base +
+// combining-mark cluster before GSUB/GPOS, mirroring HarfBuzz: decomposed
+// input ("i" + U+0300, or Arabic alef + fathatan + hamza-above) shapes to the
+// font's precomposed glyph ([igrave]; [alef-with-hamza, fathatan]) instead of
+// separate glyphs. Mutates `glyphs` in place.
+function composeGlyphs(font, glyphs) {
+  const isLoneMark = ({codePoints}) =>
+    codePoints.length === 1 && isMark(codePoints[0]);
+  let i = 0;
+  while (i < glyphs.length) {
+    // A cluster starts at a non-mark base glyph...
+    if (glyphs[i].codePoints.length !== 1 || isLoneMark(glyphs[i])) {
+      i++;
+      continue;
+    }
+    // ...and extends across the combining marks that follow it.
+    let end = i + 1;
+    while (end < glyphs.length && isLoneMark(glyphs[end])) end++;
+    let input = glyphs.slice(i, end).map(glyph => glyph.codePoints[0]);
+    let composed = composeCodePoints(font, input);
+    // Only rebuild when composition (or its decompose fallback) actually changed
+    // the glyph count. NFC also canonically reorders marks, but applying a pure
+    // reorder isn't needed to reach a precomposed glyph and would disturb the
+    // order downstream GSUB expects (e.g. Arabic shadda + vowel calt lookups).
+    if (composed.length !== input.length) {
+      // The base's features are global at this stage, so the rebuilt cluster
+      // (precomposed base + any leftover marks) inherits them uniformly.
+      let features = glyphs[i].features;
+      let replacement = composed.map(
+        cp => new GlyphInfo(font, font.glyphForCodePoint(cp).id, [cp], features)
+      );
+      glyphs.splice(i, end - i, ...replacement);
+      i += replacement.length;
+    } else {
+      i = end;
+    }
+  }
+}
+
+// Font-aware canonical composition of one cluster's codepoints. NFC composes
+// (and canonically reorders) the cluster; a composed codepoint the font can't
+// render is decomposed again (NFD) so its marks stay separate for GPOS.
+// Returns a new array; `codePoints` is not modified.
+function composeCodePoints(font, codePoints) {
+  const hasGlyph = cp => font.hasGlyphForCodePoint(cp);
+  const normalize = (cps, form) => {
+    let text = String.fromCodePoint(...cps).normalize(form);
+    return Array.from(text, char => char.codePointAt(0));
+  };
+  let result = [];
+  for (let cp of normalize(codePoints, 'NFC')) {
+    let parts = hasGlyph(cp) ? [cp] : normalize([cp], 'NFD');
+    // An unsupported codepoint taken verbatim from the input stays whole unless
+    // the font covers every part; otherwise one .notdef would become several.
+    if (codePoints.includes(cp) && !parts.every(hasGlyph)) parts = [cp];
+    result.push(...parts);
+  }
+  // NFC expands composition-excluded characters (U+FB2A -> U+05E9 U+05C1) even
+  // when the font has their glyph: a fully covered cluster must never grow.
+  let expanded = result.length > codePoints.length && codePoints.every(hasGlyph);
+  return expanded ? [...codePoints] : result;
+}
diff --git a/test/shaping.js b/test/shaping.js
index dc005ea0..70e871f2 100644
--- a/test/shaping.js
+++ b/test/shaping.js
@@ -582,4 +582,54 @@ describe('shaping', function () {
       test('SHBALI-2/12', 'NotoSans/NotoSansBalinese-Regular.ttf', "ᬓ᭄ᭅᬸ", '23+2275|162+0|60@0,-1000+0');
     });
   });
+
+  describe('canonical composition (NFC)', function () {
+    // Before GSUB/GPOS, HarfBuzz swaps a base + combining-mark sequence for the
+    // font's precomposed glyph when one exists. The default shaper must match:
+    // decomposed input shapes exactly like the precomposed character.
+    const openFont = path => fontkit.openSync(new URL(path, import.meta.url));
+    const glyphIds = (face, cps) => face.layout(String.fromCodePoint(...cps)).glyphs.map(g => g.id);
+    const font = openFont('data/FiraSans/FiraSans-Regular.ttf');
+    const shape = (...cps) => glyphIds(font, cps);
+
+    it('composes base + combining mark into the precomposed glyph', function () {
+      // U+0069 "i" followed by U+0300 (combining grave) equals U+00EC "ì".
+      assert.deepEqual(shape(0x69, 0x300), shape(0x00EC));
+    });
+
+    it('composes a multi-mark sequence greedily', function () {
+      // U+0065 "e" + U+0302 (circumflex) + U+0301 (acute) equals U+1EBF "ế".
+      assert.deepEqual(shape(0x65, 0x302, 0x301), shape(0x1EBF));
+    });
+
+    it('leaves a sequence decomposed when the font has no precomposed glyph', function () {
+      // Unicode has no precomposed "b" + grave, so two glyphs must remain.
+      assert.equal(shape(0x62, 0x300).length, 2);
+    });
+
+    it('does not interfere with GSUB ligatures', function () {
+      // "office" still gets its fi ligature; composition only touches marks.
+      assert.ok(shape(0x6F, 0x66, 0x66, 0x69, 0x63, 0x65).length < 6);
+    });
+
+    it('reorders and composes Arabic marks across combining classes', function () {
+      const amiri = openFont('data/amiri/amiri-regular.ttf');
+      const shapeAr = (...cps) => glyphIds(amiri, cps);
+      // Alef + combining hamza-above equals precomposed U+0623 (alef with hamza).
+      assert.deepEqual(shapeAr(0x627, 0x654), shapeAr(0x623));
+      // Alef + fathatan + hamza-above: the hamza (ccc 230) composes onto the alef
+      // across the lower-class fathatan (ccc 27), matching U+0623 + fathatan.
+      assert.deepEqual(shapeAr(0x627, 0x64b, 0x654), shapeAr(0x623, 0x64b));
+    });
+
+    it('leaves a non-composing mark cluster unreordered for GSUB', function () {
+      // Alef + shadda + fathatan never compose, so the original order must
+      // survive. Amiri's calt keys on shadda-before-vowel to pick fathatan's small
+      // variant; an NFC reorder (shadda ccc 33 after fathatan ccc 27) breaks it.
+      const amiri = openFont('data/amiri/amiri-regular.ttf');
+      const text = String.fromCodePoint(0x627, 0x651, 0x64b);
+      const { glyphs } = amiri.layout(text, { calt: true }, undefined, 'ARA ');
+      assert.deepEqual(glyphs.map(g => g.name), ['uni064B.small', 'uni0651', 'uni0627']);
+    });
+  });
 });