diff --git a/Lib/ufo2ft/featureWriters/kernFeatureWriter.py b/Lib/ufo2ft/featureWriters/kernFeatureWriter.py index 0713bb8f..524c4ddd 100644 --- a/Lib/ufo2ft/featureWriters/kernFeatureWriter.py +++ b/Lib/ufo2ft/featureWriters/kernFeatureWriter.py @@ -60,6 +60,12 @@ def unicodeBidiType(uv): return None +def script_direction(script: str) -> str: + if script == COMMON_SCRIPT: + return "Auto" + return script_horizontal_direction(script, "LTR") + + @dataclass(frozen=True, order=False) class KerningPair: __slots__ = ("side1", "side2", "value") @@ -157,24 +163,26 @@ class KernFeatureWriter(BaseFeatureWriter): second lookup without the ignore marks flag. * Go through all kerning pairs and split them up by script, to put them in different lookups. This reduces the size of each lookup compared to - splitting by direction, as previously done. + splitting by direction, as previously done. If there are kerning pairs + with different scripts on each side, these scripts are all kept together + to allow for cross-script kerning (in implementations that apply it). + Scripts with different direction are always split. * Partition the first and second side of a pair by script and emit only those with the same script (e.g. `a` and `b` are both "Latn", `period` and `period` are both "Default", but `a` and `a-cy` would mix "Latn" - and "Cyrl" and are dropped) or those that kern an explicit against a - "common" or "inherited" script, e.g. `a` and `period`. + and "Cyrl" and are dropped), or those with kerning across them, or + those that kern an explicit against a "common" or "inherited" script + (e.g. `a` and `period`). * Glyphs can have multiple scripts assigned to them (legitimately, e.g. U+0951 DEVANAGARI STRESS SIGN UDATTA, or for random reasons like having both `sub h by h.sc` and `sub Etaprosgegrammeni by h.sc;`). Only scripts that were determined earlier to be supported by the font will be considered. Usually, we will emit pairs where both sides have - the same script and no splitting is necessary. The only mixed script - pairs we emit are common or inherited (Zyyy or Zinh) against explicit - (e.g. Latn) scripts. A glyph can be part of both for weird reasons, so - we always treat any glyph with a common or inherited script as a - purely common (not inherited) glyph for bucketing purposes. This - avoids creating overlapping groups with the multi-script glyph in a - lookup. + the same script and no splitting is necessary. A glyph can be part of + both for weird reasons, so we always treat any glyph with a common or + inherited script as a purely common (not inherited) glyph for + bucketing purposes. This avoids creating overlapping groups with the + multi-script glyph in a lookup. * Some glyphs may have a script of Zyyy or Zinh but have a disjoint set of explicit scripts as their script extension. By looking only at the script extension, we treat many of them as being part of an explicit @@ -291,7 +299,9 @@ def _write(self): lookupGroups = [] for _, lookupGroup in sorted(lookups.items()): - lookupGroups.extend(lookupGroup.values()) + lookupGroups.extend( + lkp for lkp in lookupGroup.values() if lkp not in lookupGroups + ) # NOTE: We don't write classDefs because we literalise all classes. self._insert( @@ -700,18 +710,11 @@ def _makeSplitScriptKernLookups(self, lookups, pairs, ignoreMarks=True, suffix=" assert not side2Classes.keys() & newSide2Classes.keys() side2Classes.update(newSide2Classes) - for script, pairs in kerningPerScript.items(): - scriptLookups = lookups.setdefault(script, {}) - - key = f"kern_{script}{suffix}" - lookup = scriptLookups.get(key) - if not lookup: - # For neatness: - lookup = self._makeKerningLookup( - key.replace(COMMON_SCRIPT, COMMON_CLASS_NAME), - ignoreMarks=ignoreMarks, - ) - scriptLookups[key] = lookup + for scripts, pairs in kerningPerScript.items(): + lookupName = f"kern_{'_'.join(scripts)}{suffix}".replace( + COMMON_SCRIPT, COMMON_CLASS_NAME + ) + lookup = self._makeKerningLookup(lookupName, ignoreMarks=ignoreMarks) for pair in pairs: bidiTypes = { direction @@ -726,13 +729,17 @@ def _makeSplitScriptKernLookups(self, lookups, pairs, ignoreMarks=True, suffix=" pair.value, ) continue - scriptIsRtl = script_horizontal_direction(script, "LTR") == "RTL" + directions = {script_direction(script) for script in scripts} + assert len(directions) == 1 + scriptIsRtl = directions == {"RTL"} # Numbers are always shaped LTR even in RTL scripts: pairIsRtl = scriptIsRtl and "L" not in bidiTypes rule = self._makePairPosRule( pair, side1Classes, side2Classes, pairIsRtl ) lookup.statements.append(rule) + for script in scripts: + lookups.setdefault(script, {})[lookupName] = lookup # Clean out empty lookups. for script, scriptLookups in list(lookups.items()): @@ -771,7 +778,9 @@ def _registerLookups( isKernBlock = feature.name == "kern" dfltLookups: list[ast.LookupBlock] = [] if isKernBlock and COMMON_SCRIPT in lookups: - dfltLookups.extend(lookups[COMMON_SCRIPT].values()) + dfltLookups.extend( + lkp for lkp in lookups[COMMON_SCRIPT].values() if lkp not in dfltLookups + ) # InDesign bugfix: register kerning lookups for all LTR scripts under DFLT # so that the basic composer, without a language selected, will still kern. @@ -780,12 +789,14 @@ def _registerLookups( lookupsLTR: list[ast.LookupBlock] = [] lookupsRTL: list[ast.LookupBlock] = [] for script, scriptLookups in sorted(lookups.items()): - if script != COMMON_SCRIPT and script not in DIST_ENABLED_SCRIPTS: - if script_horizontal_direction(script, "LTR") == "LTR": + if script not in DIST_ENABLED_SCRIPTS: + if script_direction(script) == "LTR": lookupsLTR.extend(scriptLookups.values()) - elif script_horizontal_direction(script, "LTR") == "RTL": + elif script_direction(script) == "RTL": lookupsRTL.extend(scriptLookups.values()) - dfltLookups.extend(lookupsLTR or lookupsRTL) + dfltLookups.extend( + lkp for lkp in (lookupsLTR or lookupsRTL) if lkp not in dfltLookups + ) if dfltLookups: languages = feaLanguagesByScript.get("DFLT", ["dflt"]) @@ -814,27 +825,38 @@ def _registerLookups( feature.statements.append(ast.Comment("")) # We have something for this script. First add the default # lookups, then the script-specific ones - lookupsForThisScript = [] + lookupsForThisScript = {} for dfltScript in DFLT_SCRIPTS: if dfltScript in lookups: - lookupsForThisScript.extend(lookups[dfltScript].values()) - lookupsForThisScript.extend(lookups[script].values()) + lookupsForThisScript.update(lookups[dfltScript]) + lookupsForThisScript.update(lookups[script]) # Register the lookups for all languages defined in the feature # file for the script, otherwise kerning is not applied if any # language is set at all. languages = feaLanguagesByScript.get(tag, ["dflt"]) - ast.addLookupReferences(feature, lookupsForThisScript, tag, languages) + ast.addLookupReferences( + feature, lookupsForThisScript.values(), tag, languages + ) def splitKerning(pairs, glyphScripts): # Split kerning into per-script buckets, so we can post-process them before - # continuing. + # continuing. Scripts that have cross-script kerning pairs will be put in + # the same bucket. kerningPerScript = {} for pair in pairs: - for script, splitPair in partitionByScript(pair, glyphScripts): - kerningPerScript.setdefault(script, []).append(splitPair) + for scripts, splitPair in partitionByScript(pair, glyphScripts): + scripts = tuple(sorted(scripts)) + kerningPerScript.setdefault(scripts, []).append(splitPair) - for pairs in kerningPerScript.values(): + kerningPerScript = mergeScripts(kerningPerScript) + + for scripts, pairs in kerningPerScript.items(): + if len(scripts) > 1: + LOGGER.info( + "Merging kerning lookups from the following scripts: %s", + ", ".join(scripts), + ) pairs.sort() return kerningPerScript @@ -847,8 +869,9 @@ def partitionByScript( """Split a potentially mixed-script pair into pairs that make sense based on the dominant script, and yield each combination with its dominant script.""" - side1Scripts: dict[str, set[str]] = {} - side2Scripts: dict[str, set[str]] = {} + side1Directions: dict[str, set[str]] = {} + side2Directions: dict[str, set[str]] = {} + resolvedScripts: dict[str, set[str]] = {} for glyph in pair.firstGlyphs: scripts = glyphScripts.get(glyph, DFLT_SCRIPTS) # If a glyph is both common or inherited *and* another script, treat it @@ -859,58 +882,102 @@ def partitionByScript( # script-specific one. if scripts & DFLT_SCRIPTS: scripts = COMMON_SCRIPTS_SET - for script in scripts: - side1Scripts.setdefault(script, set()).add(glyph) + resolvedScripts[glyph] = scripts + for direction in (script_direction(script) for script in sorted(scripts)): + side1Directions.setdefault(direction, set()).add(glyph) for glyph in pair.secondGlyphs: scripts = glyphScripts.get(glyph, DFLT_SCRIPTS) if scripts & DFLT_SCRIPTS: scripts = COMMON_SCRIPTS_SET - for script in scripts: - side2Scripts.setdefault(script, set()).add(glyph) + resolvedScripts[glyph] = scripts + for direction in (script_direction(script) for script in sorted(scripts)): + side2Directions.setdefault(direction, set()).add(glyph) - for firstScript, secondScript in itertools.product(side1Scripts, side2Scripts): - # Preserve the type (glyph or class) of each side. - localGlyphs: set[str] = set() + for side1Direction, side2Direction in itertools.product( + side1Directions, side2Directions + ): localSide1: str | tuple[str, ...] localSide2: str | tuple[str, ...] + side1Scripts: set[str] = set() + side2Scripts: set[str] = set() if pair.firstIsClass: - localSide1 = tuple(sorted(side1Scripts[firstScript])) - localGlyphs.update(localSide1) + localSide1 = tuple(sorted(side1Directions[side1Direction])) + for glyph in localSide1: + side1Scripts |= resolvedScripts[glyph] else: - assert len(side1Scripts[firstScript]) == 1 - (localSide1,) = side1Scripts[firstScript] - localGlyphs.add(localSide1) + assert len(side1Directions[side1Direction]) == 1 + (localSide1,) = side1Directions[side1Direction] + side1Scripts |= resolvedScripts[localSide1] if pair.secondIsClass: - localSide2 = tuple(sorted(side2Scripts[secondScript])) - localGlyphs.update(localSide2) - else: - assert len(side2Scripts[secondScript]) == 1 - (localSide2,) = side2Scripts[secondScript] - localGlyphs.add(localSide2) - - if firstScript == secondScript or secondScript == COMMON_SCRIPT: - localScript = firstScript - elif firstScript == COMMON_SCRIPT: - localScript = secondScript - # Two different explicit scripts: + localSide2 = tuple(sorted(side2Directions[side2Direction])) + for glyph in localSide2: + side2Scripts |= resolvedScripts[glyph] else: + assert len(side2Directions[side2Direction]) == 1 + (localSide2,) = side2Directions[side2Direction] + side2Scripts |= resolvedScripts[localSide2] + + # Skip pairs with mixed direction. + if side1Direction != side2Direction and not any( + side == "Auto" for side in (side1Direction, side2Direction) + ): LOGGER.info( - "Skipping kerning pair <%s %s %s> with mixed script (%s, %s)", + "Skipping kerning pair <%s %s %s> with mixed direction (%s, %s)", pair.side1, pair.side2, pair.value, - firstScript, - secondScript, + side1Direction, + side2Direction, ) continue - yield localScript, KerningPair( + scripts = side1Scripts | side2Scripts + # If only one side has Common, drop it + if not all(side & COMMON_SCRIPTS_SET for side in (side1Scripts, side2Scripts)): + scripts -= COMMON_SCRIPTS_SET + + yield scripts, KerningPair( localSide1, localSide2, pair.value, ) +def mergeScripts(kerningPerScript): + """Merge buckets that have common scripts. If we have [A, B], [B, C], and + [D] buckets, we want to merge the first two into [A, B, C] and leave [D] so + that all kerning pairs of the three scripts are in the same lookup.""" + sets = [set(scripts) for scripts in kerningPerScript if scripts] + merged = True + while merged: + merged = False + result = [] + while sets: + common, rest = sets[0], sets[1:] + sets = [] + for scripts in rest: + if scripts.isdisjoint(common): + sets.append(scripts) + else: + merged = True + common |= scripts + result.append(common) + sets = result + + # Now that we have merged all common-script buckets, we need to re-assign + # the kerning pairs to the new buckets. + result = {tuple(sorted(scripts)): [] for scripts in sets} + for scripts, pairs in kerningPerScript.items(): + for scripts2 in sets: + if scripts2 & set(scripts): + result[tuple(sorted(scripts2))].extend(pairs) + break + else: + # Shouldn't happen, but just in case. + raise AssertionError + return result + + def makeAllGlyphClassDefinitions(kerningPerScript, context, feaFile=None): # Note: Refer to the context for existing classDefs and mappings of glyph # class tuples to feaLib AST to avoid overwriting existing class names, @@ -931,9 +998,10 @@ def makeAllGlyphClassDefinitions(kerningPerScript, context, feaFile=None): # Generate common class names first so that common classes are correctly # named in other lookups. - if COMMON_SCRIPT in kerningPerScript: - common_pairs = kerningPerScript[COMMON_SCRIPT] - for pair in common_pairs: + for scripts, pairs in kerningPerScript.items(): + if set(scripts) != COMMON_SCRIPTS_SET: + continue + for pair in pairs: if ( pair.firstIsClass and pair.side1 not in existingSide1Classes @@ -964,9 +1032,10 @@ def makeAllGlyphClassDefinitions(kerningPerScript, context, feaFile=None): ) sortedKerningPerScript = sorted(kerningPerScript.items()) - for script, pairs in sortedKerningPerScript: - if script == COMMON_SCRIPT: + for scripts, pairs in sortedKerningPerScript: + if set(scripts) == COMMON_SCRIPTS_SET: continue + script = "_".join(scripts).replace(COMMON_SCRIPT, COMMON_CLASS_NAME) for pair in pairs: if ( pair.firstIsClass diff --git a/tests/featureWriters/kernFeatureWriter_test.py b/tests/featureWriters/kernFeatureWriter_test.py index 9653c3bb..a4caa618 100644 --- a/tests/featureWriters/kernFeatureWriter_test.py +++ b/tests/featureWriters/kernFeatureWriter_test.py @@ -595,29 +595,23 @@ def test_arabic_numerals(self, FontClass): assert dedent(str(generated)) == dedent( """\ - lookup kern_Arab { - lookupflag IgnoreMarks; - pos four-ar seven-ar -30; - } kern_Arab; - - lookup kern_Thaa { + lookup kern_Arab_Thaa { lookupflag IgnoreMarks; pos four-ar seven-ar -30; - } kern_Thaa; + } kern_Arab_Thaa; feature kern { script DFLT; language dflt; - lookup kern_Arab; - lookup kern_Thaa; + lookup kern_Arab_Thaa; script arab; language dflt; - lookup kern_Arab; + lookup kern_Arab_Thaa; script thaa; language dflt; - lookup kern_Thaa; + lookup kern_Arab_Thaa; } kern; """ ) @@ -1496,74 +1490,55 @@ def test_kern_split_and_drop(FontClass, caplog): assert dedent(str(newFeatures)) == dedent( """\ - @kern1.Grek.bar = [period]; - @kern1.Grek.foo = [alpha]; - @kern1.Latn.foo = [a]; - @kern1.Orya.foo = [a-orya]; - @kern2.Grek.bar = [period]; - @kern2.Grek.foo = [alpha]; - @kern2.Latn.foo = [a]; - @kern2.Orya.foo = [a-orya]; - - lookup kern_Grek { - lookupflag IgnoreMarks; - pos @kern1.Grek.foo @kern2.Grek.bar 20; - pos @kern1.Grek.bar @kern2.Grek.foo 20; - } kern_Grek; - - lookup kern_Latn { + @kern1.Cyrl_Grek_Latn_Orya.bar = [a-cy]; + @kern1.Cyrl_Grek_Latn_Orya.bar_1 = [period]; + @kern1.Cyrl_Grek_Latn_Orya.foo = [a a-orya alpha]; + @kern2.Cyrl_Grek_Latn_Orya.bar = [a-cy]; + @kern2.Cyrl_Grek_Latn_Orya.bar_1 = [period]; + @kern2.Cyrl_Grek_Latn_Orya.foo = [a a-orya alpha]; + + lookup kern_Cyrl_Grek_Latn_Orya { lookupflag IgnoreMarks; - pos @kern1.Latn.foo @kern2.Grek.bar 20; - pos @kern1.Grek.bar @kern2.Latn.foo 20; - } kern_Latn; - - lookup kern_Orya { - lookupflag IgnoreMarks; - pos @kern1.Orya.foo @kern2.Grek.bar 20; - pos @kern1.Grek.bar @kern2.Orya.foo 20; - } kern_Orya; + pos @kern1.Cyrl_Grek_Latn_Orya.foo @kern2.Cyrl_Grek_Latn_Orya.bar 20; + pos @kern1.Cyrl_Grek_Latn_Orya.foo @kern2.Cyrl_Grek_Latn_Orya.bar_1 20; + pos @kern1.Cyrl_Grek_Latn_Orya.bar @kern2.Cyrl_Grek_Latn_Orya.foo 20; + pos @kern1.Cyrl_Grek_Latn_Orya.bar_1 @kern2.Cyrl_Grek_Latn_Orya.foo 20; + } kern_Cyrl_Grek_Latn_Orya; feature kern { script DFLT; language dflt; - lookup kern_Grek; - lookup kern_Latn; + lookup kern_Cyrl_Grek_Latn_Orya; + + script cyrl; + language dflt; + lookup kern_Cyrl_Grek_Latn_Orya; script grek; language dflt; - lookup kern_Grek; + lookup kern_Cyrl_Grek_Latn_Orya; script latn; language dflt; - lookup kern_Latn; + lookup kern_Cyrl_Grek_Latn_Orya; } kern; feature dist { script ory2; language dflt; - lookup kern_Orya; + lookup kern_Cyrl_Grek_Latn_Orya; script orya; language dflt; - lookup kern_Orya; + lookup kern_Cyrl_Grek_Latn_Orya; } dist; """ ) - msgs = sorted(msg[-30:] for msg in caplog.messages) - assert msgs == [ - "with mixed script (Arab, Grek)", - "with mixed script (Arab, Latn)", - "with mixed script (Arab, Orya)", - "with mixed script (Cyrl, Grek)", - "with mixed script (Cyrl, Latn)", - "with mixed script (Cyrl, Orya)", - "with mixed script (Grek, Arab)", - "with mixed script (Grek, Cyrl)", - "with mixed script (Latn, Arab)", - "with mixed script (Latn, Cyrl)", - "with mixed script (Orya, Arab)", - "with mixed script (Orya, Cyrl)", + assert caplog.messages == [ + "Skipping kerning pair <('a', 'a-orya', 'alpha') ('a-cy', 'alef-ar', 'period') 20> with mixed direction (LTR, RTL)", + "Skipping kerning pair <('a-cy', 'alef-ar', 'period') ('a', 'a-orya', 'alpha') 20> with mixed direction (RTL, LTR)", + "Merging kerning lookups from the following scripts: Cyrl, Grek, Latn, Orya", ] @@ -1598,7 +1573,7 @@ def test_kern_split_and_drop_mixed(caplog, FontClass): """ ) assert ( - "Skipping kerning pair <('V', 'W') ('W', 'gba-nko') -20> with mixed script (Latn, Nkoo)" + "Skipping kerning pair <('V', 'W') ('W', 'gba-nko') -20> with mixed direction (LTR, RTL)" in caplog.text ) @@ -1681,33 +1656,26 @@ def test_kern_multi_script(FontClass): assert dedent(str(newFeatures)) == dedent( """\ - @kern1.Arab.foo = [lam-ar]; - @kern1.Nkoo.foo = [gba-nko]; - @kern2.Arab.foo = [comma-ar]; - - lookup kern_Arab { - lookupflag IgnoreMarks; - pos @kern1.Arab.foo @kern2.Arab.foo <-20 0 -20 0>; - } kern_Arab; + @kern1.Arab_Nkoo.foo = [gba-nko lam-ar]; + @kern2.Arab_Nkoo.foo = [comma-ar]; - lookup kern_Nkoo { + lookup kern_Arab_Nkoo { lookupflag IgnoreMarks; - pos @kern1.Nkoo.foo @kern2.Arab.foo <-20 0 -20 0>; - } kern_Nkoo; + pos @kern1.Arab_Nkoo.foo @kern2.Arab_Nkoo.foo <-20 0 -20 0>; + } kern_Arab_Nkoo; feature kern { script DFLT; language dflt; - lookup kern_Arab; - lookup kern_Nkoo; + lookup kern_Arab_Nkoo; script arab; language dflt; - lookup kern_Arab; + lookup kern_Arab_Nkoo; script nko; language dflt; - lookup kern_Nkoo; + lookup kern_Arab_Nkoo; } kern; """ ) @@ -1837,13 +1805,14 @@ def test_kern_zyyy_zinh(FontClass): pos uni1DC0 uni1DC0 6; } kern_Grek; - lookup kern_Hani { + lookup kern_Hani_Hrkt { lookupflag IgnoreMarks; pos uni1D360 uni1D360 37; pos uni1D370 uni1D370 38; pos uni1F250 uni1F250 39; pos uni3010 uni3010 8; pos uni3030 uni3030 9; + pos uni30A0 uni30A0 10; pos uni3190 uni3190 11; pos uni31C0 uni31C0 12; pos uni31D0 uni31D0 13; @@ -1861,15 +1830,8 @@ def test_kern_zyyy_zinh(FontClass): pos uni33E0 uni33E0 25; pos uni33F0 uni33F0 26; pos uniA700 uniA700 27; - } kern_Hani; - - lookup kern_Hrkt { - lookupflag IgnoreMarks; - pos uni3010 uni3010 8; - pos uni3030 uni3030 9; - pos uni30A0 uni30A0 10; pos uniFF70 uniFF70 29; - } kern_Hrkt; + } kern_Hani_Hrkt; lookup kern_Default { lookupflag IgnoreMarks; @@ -1889,8 +1851,7 @@ def test_kern_zyyy_zinh(FontClass): language dflt; lookup kern_Default; lookup kern_Grek; - lookup kern_Hani; - lookup kern_Hrkt; + lookup kern_Hani_Hrkt; script grek; language dflt; @@ -1900,12 +1861,12 @@ def test_kern_zyyy_zinh(FontClass): script hani; language dflt; lookup kern_Default; - lookup kern_Hani; + lookup kern_Hani_Hrkt; script kana; language dflt; lookup kern_Default; - lookup kern_Hrkt; + lookup kern_Hani_Hrkt; } kern; feature dist {