From c2b480260db6c96237bc8dc30849676d812b8c23 Mon Sep 17 00:00:00 2001
From: Khaled Hosny <khaled@aliftype.com>
Date: Sun, 21 Jan 2024 19:19:57 +0200
Subject: [PATCH] Allow cross-script kerning

Several implementations can apply kerning cross-script, so instead of
splitting cross-script kerning pairs, we now keep all kerning for
scripts which have common kerning pairs in one lookup and split the rest.
---
 .../featureWriters/kernFeatureWriter.py       | 254 ++++++++++--------
 1 file changed, 139 insertions(+), 115 deletions(-)

diff --git a/Lib/ufo2ft/featureWriters/kernFeatureWriter.py b/Lib/ufo2ft/featureWriters/kernFeatureWriter.py
index 0713bb8f..bdbb6c23 100644
--- a/Lib/ufo2ft/featureWriters/kernFeatureWriter.py
+++ b/Lib/ufo2ft/featureWriters/kernFeatureWriter.py
@@ -157,24 +157,26 @@ class KernFeatureWriter(BaseFeatureWriter):
       second lookup without the ignore marks flag.
     * Go through all kerning pairs and split them up by script, to put them in
       different lookups. This reduces the size of each lookup compared to
-      splitting by direction, as previously done.
+      splitting by direction, as previously done. If there are kerning pairs
+      with different scripts on each side, theese scripts are all kept together
+      to allow for cross-script kerning (in implmenetations that apply it).
+      Scripts with different direction are always split.
         * Partition the first and second side of a pair by script and emit only
           those with the same script (e.g. `a` and `b` are both "Latn", `period`
           and `period` are both "Default", but `a` and `a-cy` would mix "Latn"
-          and "Cyrl" and are dropped) or those that kern an explicit against a
-          "common" or "inherited" script, e.g. `a` and `period`.
+          and "Cyrl" and are dropped), or those with kerning across them, or
+          those that kern an explicit against a "common" or "inherited" script
+          (e.g. `a` and `period`).
         * Glyphs can have multiple scripts assigned to them (legitimately, e.g.
           U+0951 DEVANAGARI STRESS SIGN UDATTA, or for random reasons like
           having both `sub h by h.sc` and `sub Etaprosgegrammeni by h.sc;`).
           Only scripts that were determined earlier to be supported by the font
           will be considered. Usually, we will emit pairs where both sides have
-          the same script and no splitting is necessary. The only mixed script
-          pairs we emit are common or inherited (Zyyy or Zinh) against explicit
-          (e.g. Latn) scripts. A glyph can be part of both for weird reasons, so
-          we always treat any glyph with a common or inherited script as a
-          purely common (not inherited) glyph for bucketing purposes. This
-          avoids creating overlapping groups with the multi-script glyph in a
-          lookup.
+          the same script and no splitting is necessary. A glyph can be part of
+          both for weird reasons, so we always treat any glyph with a common or
+          inherited script as a purely common (not inherited) glyph for
+          bucketing purposes. This avoids creating overlapping groups with the
+          multi-script glyph in a lookup.
         * Some glyphs may have a script of Zyyy or Zinh but have a disjoint set
           of explicit scripts as their script extension. By looking only at the
           script extension, we treat many of them as being part of an explicit
@@ -291,7 +293,9 @@ def _write(self):
 
         lookupGroups = []
         for _, lookupGroup in sorted(lookups.items()):
-            lookupGroups.extend(lookupGroup.values())
+            lookupGroups.extend(
+                lkp for lkp in lookupGroup.values() if lkp not in lookupGroups
+            )
 
         # NOTE: We don't write classDefs because we literalise all classes.
         self._insert(
@@ -700,18 +704,11 @@ def _makeSplitScriptKernLookups(self, lookups, pairs, ignoreMarks=True, suffix="
         assert not side2Classes.keys() & newSide2Classes.keys()
         side2Classes.update(newSide2Classes)
 
-        for script, pairs in kerningPerScript.items():
-            scriptLookups = lookups.setdefault(script, {})
-
-            key = f"kern_{script}{suffix}"
-            lookup = scriptLookups.get(key)
-            if not lookup:
-                # For neatness:
-                lookup = self._makeKerningLookup(
-                    key.replace(COMMON_SCRIPT, COMMON_CLASS_NAME),
-                    ignoreMarks=ignoreMarks,
-                )
-                scriptLookups[key] = lookup
+        for scripts, pairs in kerningPerScript.items():
+            lookupName = f"kern_{'_'.join(scripts)}{suffix}".replace(
+                COMMON_SCRIPT, COMMON_CLASS_NAME
+            )
+            lookup = self._makeKerningLookup(lookupName, ignoreMarks=ignoreMarks)
             for pair in pairs:
                 bidiTypes = {
                     direction
@@ -726,13 +723,19 @@ def _makeSplitScriptKernLookups(self, lookups, pairs, ignoreMarks=True, suffix="
                         pair.value,
                     )
                     continue
-                scriptIsRtl = script_horizontal_direction(script, "LTR") == "RTL"
+                directions = {
+                    script_horizontal_direction(script, "LTR") for script in scripts
+                }
+                assert len(directions) == 1
+                scriptIsRtl = directions == {"RTL"}
                 # Numbers are always shaped LTR even in RTL scripts:
                 pairIsRtl = scriptIsRtl and "L" not in bidiTypes
                 rule = self._makePairPosRule(
                     pair, side1Classes, side2Classes, pairIsRtl
                 )
                 lookup.statements.append(rule)
+            for script in scripts:
+                lookups.setdefault(script, {})[lookupName] = lookup
 
         # Clean out empty lookups.
         for script, scriptLookups in list(lookups.items()):
@@ -771,7 +774,9 @@ def _registerLookups(
         isKernBlock = feature.name == "kern"
         dfltLookups: list[ast.LookupBlock] = []
         if isKernBlock and COMMON_SCRIPT in lookups:
-            dfltLookups.extend(lookups[COMMON_SCRIPT].values())
+            dfltLookups.extend(
+                lkp for lkp in lookups[COMMON_SCRIPT].values() if lkp not in dfltLookups
+            )
 
         # InDesign bugfix: register kerning lookups for all LTR scripts under DFLT
         # so that the basic composer, without a language selected, will still kern.
@@ -785,7 +790,9 @@ def _registerLookups(
                         lookupsLTR.extend(scriptLookups.values())
                     elif script_horizontal_direction(script, "LTR") == "RTL":
                         lookupsRTL.extend(scriptLookups.values())
-            dfltLookups.extend(lookupsLTR or lookupsRTL)
+            dfltLookups.extend(
+                lkp for lkp in (lookupsLTR or lookupsRTL) if lkp not in dfltLookups
+            )
 
         if dfltLookups:
             languages = feaLanguagesByScript.get("DFLT", ["dflt"])
@@ -814,25 +821,34 @@ def _registerLookups(
                     feature.statements.append(ast.Comment(""))
                 # We have something for this script. First add the default
                 # lookups, then the script-specific ones
-                lookupsForThisScript = []
+                lookupsForThisScript = {}
                 for dfltScript in DFLT_SCRIPTS:
                     if dfltScript in lookups:
-                        lookupsForThisScript.extend(lookups[dfltScript].values())
-                lookupsForThisScript.extend(lookups[script].values())
+                        lookupsForThisScript.update(lookups[dfltScript])
+                lookupsForThisScript.update(lookups[script])
                 # Register the lookups for all languages defined in the feature
                 # file for the script, otherwise kerning is not applied if any
                 # language is set at all.
                 languages = feaLanguagesByScript.get(tag, ["dflt"])
-                ast.addLookupReferences(feature, lookupsForThisScript, tag, languages)
+                ast.addLookupReferences(
+                    feature, lookupsForThisScript.values(), tag, languages
+                )
 
 
 def splitKerning(pairs, glyphScripts):
     # Split kerning into per-script buckets, so we can post-process them before
-    # continuing.
+    # continuing. Scripts that have cross-script kerning pairs will be put in
+    # the same bucket.
     kerningPerScript = {}
     for pair in pairs:
-        for script, splitPair in partitionByScript(pair, glyphScripts):
-            kerningPerScript.setdefault(script, []).append(splitPair)
+        for scripts, splitPair in partitionByScript(pair, glyphScripts):
+            scripts = tuple(sorted(scripts))
+            kerningPerScript.setdefault(scripts, []).append(splitPair)
+
+    # Merge buckets that have common scripts. If we have [A, B], [B, C], and
+    # [D] buckets, we want to merge the first two into [A, B, C] and leave [D]
+    # so that all kerning pairs of the three scripts are in the same lookup.
+    kerningPerScript = mergeScripts(kerningPerScript)
 
     for pairs in kerningPerScript.values():
         pairs.sort()
@@ -847,70 +863,113 @@ def partitionByScript(
     """Split a potentially mixed-script pair into pairs that make sense based
     on the dominant script, and yield each combination with its dominant script."""
 
-    side1Scripts: dict[str, set[str]] = {}
-    side2Scripts: dict[str, set[str]] = {}
+    def script_direction(script: str) -> str:
+        if script == COMMON_SCRIPT:
+            return "Auto"
+        return script_horizontal_direction(script, "LTR")
+
+    side1Directions: dict[str, set[str]] = {}
+    side2Directions: dict[str, set[str]] = {}
+    resolvedScripts: dict[str, set[str]] = {}
     for glyph in pair.firstGlyphs:
-        scripts = glyphScripts.get(glyph, DFLT_SCRIPTS)
-        # If a glyph is both common or inherited *and* another script, treat it
-        # as just common (throwing Zyyy and Zinh into the same bucket for
-        # simplicity). This ensures that a pair appears to the shaper exactly
-        # once, as long as every script sees at most 2 lookups (or 3 with mark
-        # lookups, but they contain distinct pairs), the common one and the
-        # script-specific one.
-        if scripts & DFLT_SCRIPTS:
-            scripts = COMMON_SCRIPTS_SET
-        for script in scripts:
-            side1Scripts.setdefault(script, set()).add(glyph)
+        side1Scripts = glyphScripts.get(glyph, DFLT_SCRIPTS)
+        # Treat Inherited as Common.
+        if side1Scripts & DFLT_SCRIPTS:
+            side1Scripts = COMMON_SCRIPTS_SET
+        resolvedScripts[glyph] = side1Scripts
+        side1Scripts = sorted(side1Scripts)
+        directions = [script_direction(script) for script in side1Scripts]
+        for direction in directions:
+            side1Directions.setdefault(direction, set()).add(glyph)
     for glyph in pair.secondGlyphs:
-        scripts = glyphScripts.get(glyph, DFLT_SCRIPTS)
-        if scripts & DFLT_SCRIPTS:
-            scripts = COMMON_SCRIPTS_SET
-        for script in scripts:
-            side2Scripts.setdefault(script, set()).add(glyph)
-
-    for firstScript, secondScript in itertools.product(side1Scripts, side2Scripts):
-        # Preserve the type (glyph or class) of each side.
-        localGlyphs: set[str] = set()
+        side1Scripts = glyphScripts.get(glyph, DFLT_SCRIPTS)
+        # Treat Inherited as Common.
+        if side1Scripts & DFLT_SCRIPTS:
+            side1Scripts = COMMON_SCRIPTS_SET
+        resolvedScripts[glyph] = side1Scripts
+        side1Scripts = sorted(side1Scripts)
+        directions = [script_direction(script) for script in side1Scripts]
+        for direction in directions:
+            side2Directions.setdefault(direction, set()).add(glyph)
+
+    for side1Direction, side2Direction in itertools.product(
+        side1Directions, side2Directions
+    ):
         localSide1: str | tuple[str, ...]
         localSide2: str | tuple[str, ...]
+        side1Scripts: set[str] = set()
+        side2Scripts: set[str] = set()
         if pair.firstIsClass:
-            localSide1 = tuple(sorted(side1Scripts[firstScript]))
-            localGlyphs.update(localSide1)
+            localSide1 = tuple(sorted(side1Directions[side1Direction]))
+            for glyph in localSide1:
+                side1Scripts |= resolvedScripts[glyph]
         else:
-            assert len(side1Scripts[firstScript]) == 1
-            (localSide1,) = side1Scripts[firstScript]
-            localGlyphs.add(localSide1)
+            assert len(side1Directions[side1Direction]) == 1
+            (localSide1,) = side1Directions[side1Direction]
+            side1Scripts |= resolvedScripts[localSide1]
         if pair.secondIsClass:
-            localSide2 = tuple(sorted(side2Scripts[secondScript]))
-            localGlyphs.update(localSide2)
-        else:
-            assert len(side2Scripts[secondScript]) == 1
-            (localSide2,) = side2Scripts[secondScript]
-            localGlyphs.add(localSide2)
-
-        if firstScript == secondScript or secondScript == COMMON_SCRIPT:
-            localScript = firstScript
-        elif firstScript == COMMON_SCRIPT:
-            localScript = secondScript
-        # Two different explicit scripts:
+            localSide2 = tuple(sorted(side2Directions[side2Direction]))
+            for glyph in localSide2:
+                side2Scripts |= resolvedScripts[glyph]
         else:
+            assert len(side2Directions[side2Direction]) == 1
+            (localSide2,) = side2Directions[side2Direction]
+            side2Scripts |= resolvedScripts[localSide2]
+
+        # Skip pairs with mixed direction.
+        if side1Direction != side2Direction and not any(
+            side == "Auto" for side in (side1Direction, side2Direction)
+        ):
             LOGGER.info(
-                "Skipping kerning pair <%s %s %s> with mixed script (%s, %s)",
+                "Skipping kerning pair <%s %s %s> with mixed direction (%s, %s)",
                 pair.side1,
                 pair.side2,
                 pair.value,
-                firstScript,
-                secondScript,
+                side1Direction,
+                side2Direction,
             )
             continue
 
-        yield localScript, KerningPair(
+        scripts = side1Scripts | side2Scripts
+        # If only one side has Common, drop it
+        if not all(side & COMMON_SCRIPTS_SET for side in (side1Scripts, side2Scripts)):
+            scripts -= COMMON_SCRIPTS_SET
+
+        yield scripts, KerningPair(
             localSide1,
             localSide2,
             pair.value,
         )
 
 
+def mergeScripts(kerningPerScript):
+    # https://stackoverflow.com/a/9112588
+    sets = [set(lst) for lst in kerningPerScript if lst]
+    merged = True
+    while merged:
+        merged = False
+        result = []
+        while sets:
+            common, rest = sets[0], sets[1:]
+            sets = []
+            for x in rest:
+                if x.isdisjoint(common):
+                    sets.append(x)
+                else:
+                    merged = True
+                    common |= x
+            result.append(common)
+        sets = result
+
+    result = {tuple(sorted(scripts)): [] for scripts in sets}
+    for scripts, pairs in kerningPerScript.items():
+        for scripts2 in sets:
+            if scripts2 & set(scripts):
+                result[tuple(sorted(scripts2))].extend(pairs)
+                break
+    return result
+
+
 def makeAllGlyphClassDefinitions(kerningPerScript, context, feaFile=None):
     # Note: Refer to the context for existing classDefs and mappings of glyph
     # class tuples to feaLib AST to avoid overwriting existing class names,
@@ -929,44 +988,9 @@ def makeAllGlyphClassDefinitions(kerningPerScript, context, feaFile=None):
         classNames = set()
     classNames.update(context.kerning.classDefs.keys())
 
-    # Generate common class names first so that common classes are correctly
-    # named in other lookups.
-    if COMMON_SCRIPT in kerningPerScript:
-        common_pairs = kerningPerScript[COMMON_SCRIPT]
-        for pair in common_pairs:
-            if (
-                pair.firstIsClass
-                and pair.side1 not in existingSide1Classes
-                and pair.side1 not in newSide1Classes
-            ):
-                addClassDefinition(
-                    "kern1",
-                    pair.side1,
-                    newSide1Classes,
-                    side1Membership,
-                    newClassDefs,
-                    classNames,
-                    COMMON_CLASS_NAME,
-                )
-            if (
-                pair.secondIsClass
-                and pair.side2 not in existingSide2Classes
-                and pair.side2 not in newSide2Classes
-            ):
-                addClassDefinition(
-                    "kern2",
-                    pair.side2,
-                    newSide2Classes,
-                    side2Membership,
-                    newClassDefs,
-                    classNames,
-                    COMMON_CLASS_NAME,
-                )
-
     sortedKerningPerScript = sorted(kerningPerScript.items())
-    for script, pairs in sortedKerningPerScript:
-        if script == COMMON_SCRIPT:
-            continue
+    for scripts, pairs in sortedKerningPerScript:
+        name = "_".join(scripts).replace(COMMON_SCRIPT, COMMON_CLASS_NAME)
         for pair in pairs:
             if (
                 pair.firstIsClass
@@ -980,7 +1004,7 @@ def makeAllGlyphClassDefinitions(kerningPerScript, context, feaFile=None):
                     side1Membership,
                     newClassDefs,
                     classNames,
-                    script,
+                    name,
                 )
             if (
                 pair.secondIsClass
@@ -994,7 +1018,7 @@ def makeAllGlyphClassDefinitions(kerningPerScript, context, feaFile=None):
                     side2Membership,
                     newClassDefs,
                     classNames,
-                    script,
+                    name,
                 )
 
     return newClassDefs, newSide1Classes, newSide2Classes