From af09f3568416e74eecee73cebeb67c83fee794d0 Mon Sep 17 00:00:00 2001
From: Colin Rofls <colin@cmyr.net>
Date: Thu, 14 Mar 2024 12:26:52 -0400
Subject: [PATCH] [kerning] cleanup and code review

---
 fontbe/src/features.rs            |   1 +
 fontbe/src/features/kern.rs       |  23 ++--
 fontbe/src/features/ot_tags.rs    | 176 ++++++++++++++++++++++++++++++
 fontbe/src/features/properties.rs | 149 ++-----------------------
 fontbe/src/orchestration.rs       |   4 +-
 5 files changed, 197 insertions(+), 156 deletions(-)
 create mode 100644 fontbe/src/features/ot_tags.rs
diff --git a/fontbe/src/features.rs b/fontbe/src/features.rs
index 8a674b512..7a41dda97 100644
--- a/fontbe/src/features.rs
+++ b/fontbe/src/features.rs
@@ -41,6 +41,7 @@ use crate::{
 
 mod kern;
 mod marks;
+mod ot_tags;
 mod properties;
 
 pub use kern::{create_gather_ir_kerning_work, create_kern_segment_work, create_kerns_work};
diff --git a/fontbe/src/features/kern.rs b/fontbe/src/features/kern.rs
index 2eadf7d92..75789bba6 100644
--- a/fontbe/src/features/kern.rs
+++ b/fontbe/src/features/kern.rs
@@ -443,8 +443,6 @@ impl KerningGatherWork {
 
         let gdef = compilation.gdef_classes;
 
-        // serves as a standin for cmap in the fonttools impl
-
         let pairs = fragments
             .iter()
             .flat_map(|frag| frag.kerns.iter())
@@ -460,18 +458,20 @@ impl KerningGatherWork {
 
     /// returns a vec of lookups (as a vec of subtables), along with a map of features -> lookups
     /// (by order in the first vec)
+    ///
+    /// this based on
+    /// <https://github.com/googlefonts/ufo2ft/blob/f6b4f42460b/Lib/ufo2ft/featureWriters/kernFeatureWriter.py#L772>
     fn assign_lookups_to_scripts(
         &self,
         lookups: HashMap<BTreeSet<UnicodeShortName>, Vec<PairPosBuilder>>,
         ast: &ParseTree,
         // one of 'kern' or 'dist'
         current_feature: Tag,
-    ) -> (Vec<Vec<PairPosBuilder>>, HashMap<FeatureKey, Vec<usize>>) {
+    ) -> (Vec<Vec<PairPosBuilder>>, BTreeMap<FeatureKey, Vec<usize>>) {
         let dflt_langs = vec![DFLT_LANG];
 
         let is_kern_feature = current_feature == KERN;
         assert!(is_kern_feature || current_feature == DIST);
-        // this is based on the _registerLookups fn, and this is where we should start typing
         let mut lookups_by_script = HashMap::new();
         let mut ordered_lookups = Vec::new();
 
@@ -492,8 +492,6 @@ impl KerningGatherWork {
             }
         }
 
-        // now implement the logic from _registerLookups in KernFeatureWriter:
-        // <https://github.com/googlefonts/ufo2ft/blob/f6b4f42460b/Lib/ufo2ft/featureWriters/kernFeatureWriter.py#L772>
         let mut default_lookups = Vec::new();
         if let Some(common_lookups) = lookups_by_script
             .get(&COMMON_SCRIPT)
@@ -504,6 +502,7 @@ impl KerningGatherWork {
         }
 
         //inDesign bugfix:
+        // <https://github.com/googlefonts/ufo2ft/blob/f6b4f42460b/Lib/ufo2ft/featureWriters/kernFeatureWriter.py#L785>
         let dist_enabled_scripts = super::properties::dist_feature_enabled_scripts();
         let (mut ltr_lookups, mut rtl_lookups) = (Vec::new(), Vec::new());
         for (script, lookups) in lookups_by_script
@@ -525,10 +524,9 @@ impl KerningGatherWork {
         }
         default_lookups.sort_unstable();
 
-        let mut features = HashMap::new();
+        let mut features = BTreeMap::new();
         if !default_lookups.is_empty() {
             let languages = fea_langs_by_script.get(&DFLT_SCRIPT).unwrap_or(&dflt_langs);
-            log::debug!("DFLT languages: {languages:?}");
             for lang in languages {
                 features.insert(
                     FeatureKey::new(KERN, *lang, DFLT_SCRIPT),
@@ -552,11 +550,12 @@ impl KerningGatherWork {
 
         for (script, mut lookups) in lookups_by_script {
             lookups.extend(dflt_lookups.iter().copied());
+            lookups.sort_unstable();
+            lookups.dedup();
 
             for tag in super::properties::script_to_ot_tags(&script) {
                 let languages = fea_langs_by_script.get(&tag).unwrap_or(&dflt_langs);
                 for lang in languages {
-                    log::info!("added {} lookups for {tag}/{lang}", lookups.len());
                     features.insert(FeatureKey::new(KERN, *lang, tag), lookups.clone());
                 }
             }
@@ -568,19 +567,19 @@ impl KerningGatherWork {
 }
 
 fn debug_ordered_lookups(
-    features: &HashMap<FeatureKey, Vec<usize>>,
+    features: &BTreeMap<FeatureKey, Vec<usize>>,
     lookups: &[Vec<PairPosBuilder>],
 ) {
     for (i, subtables) in lookups.iter().enumerate() {
         let total_rules = subtables.iter().map(|x| x.len()).sum::<usize>();
-        log::debug!("lookup {i}, {total_rules} rules");
+        log::trace!("lookup {i}, {total_rules} rules");
     }
 
     let mut feature_keys = features.keys().collect::<Vec<_>>();
     feature_keys.sort();
     for feature in feature_keys {
         let indices = features.get(feature).unwrap();
-        log::debug!("feature {feature:?}, lookups {indices:?}");
+        log::trace!("feature {feature:?}, lookups {indices:?}");
     }
 }
 
diff --git a/fontbe/src/features/ot_tags.rs b/fontbe/src/features/ot_tags.rs
new file mode 100644
index 000000000..c8c768237
--- /dev/null
+++ b/fontbe/src/features/ot_tags.rs
@@ -0,0 +1,176 @@
+//! mapping opentype tags to unicode scripts
+//!
+//! based on
+//! <https://github.com/fonttools/fonttools/blob/8697f91cdc/Lib/fontTools/unicodedata/OTTags.py>
+
+use write_fonts::types::Tag;
+
+pub(crate) const DFLT_SCRIPT: Tag = Tag::new(b"DFLT");
+
+pub(crate) static SCRIPT_ALIASES: &[(Tag, Tag)] = &[(Tag::new(b"jamo"), Tag::new(b"hang"))];
+
+pub(crate) static SCRIPT_EXCEPTIONS: &[(&str, Tag)] = &[
+    ("Hira", Tag::new(b"kana")),
+    ("Hrkt", Tag::new(b"kana")),
+    ("Laoo", Tag::new(b"lao ")),
+    ("Nkoo", Tag::new(b"nko ")),
+    ("Vaii", Tag::new(b"vai ")),
+    ("Yiii", Tag::new(b"yi  ")),
+    ("Zinh", DFLT_SCRIPT),
+    ("Zmth", Tag::new(b"math")),
+    ("Zyyy", DFLT_SCRIPT),
+    ("Zzzz", DFLT_SCRIPT),
+];
+
+// 'math' is used as a script in opentype features:
+// <https://github.com/harfbuzz/harfbuzz/pull/3417>
+pub(crate) static SCRIPT_EXCEPTIONS_REVERSED: &[(Tag, &str)] = &[(Tag::new(b"math"), "Zmth")];
+
+pub(crate) static NEW_SCRIPTS: &[(Tag, &str)] = &[
+    (Tag::new(b"bng2"), "Beng"),
+    (Tag::new(b"dev2"), "Deva"),
+    (Tag::new(b"gjr2"), "Gujr"),
+    (Tag::new(b"gur2"), "Guru"),
+    (Tag::new(b"knd2"), "Knda"),
+    (Tag::new(b"mlm2"), "Mlym"),
+    (Tag::new(b"mym2"), "Mymr"),
+    (Tag::new(b"ory2"), "Orya"),
+    (Tag::new(b"tel2"), "Telu"),
+    (Tag::new(b"tml2"), "Taml"),
+];
+
+pub(crate) static NEW_SCRIPT_TAGS: &[(&str, Tag)] = &[
+    ("Beng", Tag::new(b"bng2")),
+    ("Deva", Tag::new(b"dev2")),
+    ("Gujr", Tag::new(b"gjr2")),
+    ("Guru", Tag::new(b"gur2")),
+    ("Kana", Tag::new(b"knd2")),
+    ("Mlym", Tag::new(b"mlm2")),
+    ("Mymr", Tag::new(b"mym2")),
+    ("Orya", Tag::new(b"ory2")),
+    ("Taml", Tag::new(b"tml2")),
+    ("Telu", Tag::new(b"tel2")),
+];
+
+pub(crate) static INDIC_SCRIPTS: &[&str] = &[
+    "Beng", // Bengali
+    "Deva", // Devanagari
+    "Gujr", // Gujarati
+    "Guru", // Gurmukhi
+    "Knda", // Kannada
+    "Mlym", // Malayalam
+    "Orya", // Oriya
+    "Sinh", // Sinhala
+    "Taml", // Tamil
+    "Telu", // Telugu
+];
+
+pub(crate) static USE_SCRIPTS: &[&str] = &[
+    // Correct as at Unicode 15.0
+    "Adlm", // Adlam
+    "Ahom", // Ahom
+    "Bali", // Balinese
+    "Batk", // Batak
+    "Brah", // Brahmi
+    "Bugi", // Buginese
+    "Buhd", // Buhid
+    "Cakm", // Chakma
+    "Cham", // Cham
+    "Chrs", // Chorasmian
+    "Cpmn", // Cypro Minoan
+    "Diak", // Dives Akuru
+    "Dogr", // Dogra
+    "Dupl", // Duployan
+    "Egyp", // Egyptian Hieroglyphs
+    "Elym", // Elymaic
+    "Gong", // Gunjala Gondi
+    "Gonm", // Masaram Gondi
+    "Gran", // Grantha
+    "Hano", // Hanunoo
+    "Hmng", // Pahawh Hmong
+    "Hmnp", // Nyiakeng Puachue Hmong
+    "Java", // Javanese
+    "Kali", // Kayah Li
+    "Kawi", // Kawi
+    "Khar", // Kharosthi
+    "Khoj", // Khojki
+    "Kits", // Khitan Small Script
+    "Kthi", // Kaithi
+    "Lana", // Tai Tham
+    "Lepc", // Lepcha
+    "Limb", // Limbu
+    "Mahj", // Mahajani
+    "Maka", // Makasar
+    "Mand", // Mandaic
+    "Mani", // Manichaean
+    "Marc", // Marchen
+    "Medf", // Medefaidrin
+    "Modi", // Modi
+    "Mong", // Mongolian
+    "Mtei", // Meetei Mayek
+    "Mult", // Multani
+    "Nagm", // Nag Mundari
+    "Nand", // Nandinagari
+    "Newa", // Newa
+    "Nhks", // Bhaiksuki
+    "Nko ", // Nko
+    "Ougr", // Old Uyghur
+    "Phag", // Phags Pa
+    "Phlp", // Psalter Pahlavi
+    "Plrd", // Miao
+    "Rjng", // Rejang
+    "Rohg", // Hanifi Rohingya
+    "Saur", // Saurashtra
+    "Shrd", // Sharada
+    "Sidd", // Siddham
+    "Sind", // Khudawadi
+    "Sogd", // Sogdian
+    "Sogo", // Old Sogdian
+    "Soyo", // Soyombo
+    "Sund", // Sundanese
+    "Sylo", // Syloti Nagri
+    "Tagb", // Tagbanwa
+    "Takr", // Takri
+    "Tale", // Tai Le
+    "Tavt", // Tai Viet
+    "Tfng", // Tifinagh
+    "Tglg", // Tagalog
+    "Tibt", // Tibetan
+    "Tirh", // Tirhuta
+    "Tnsa", // Tangsa
+    "Toto", // Toto
+    "Vith", // Vithkuqi
+    "Wcho", // Wancho
+    "Yezi", // Yezidi
+    "Zanb", // Zanabazar Square
+];
+
+#[cfg(test)]
+mod tests {
+    use super::*;
+
+    /// we want to binary search these, so let's enforce that they are sorted,
+    /// to avoid future headaches
+    #[test]
+    fn const_arrays_are_sorted() {
+        fn get_original_and_sorted_items<T: Clone + Ord + Eq, U>(
+            items: &[(T, U)],
+        ) -> (Vec<T>, Vec<T>) {
+            let originals = items.iter().map(|(a, _)| a.clone()).collect::<Vec<_>>();
+            let mut sorted = originals.clone();
+            sorted.sort();
+            (originals, sorted)
+        }
+
+        let (actual, expected) = get_original_and_sorted_items(SCRIPT_ALIASES);
+        assert_eq!(actual, expected);
+        let (actual, expected) = get_original_and_sorted_items(SCRIPT_EXCEPTIONS_REVERSED);
+        assert_eq!(actual, expected);
+        let (actual, expected) = get_original_and_sorted_items(NEW_SCRIPTS);
+        assert_eq!(actual, expected);
+        let (actual, expected) = get_original_and_sorted_items(NEW_SCRIPT_TAGS);
+        assert_eq!(actual, expected);
+        let (actual, expected) = get_original_and_sorted_items(SCRIPT_EXCEPTIONS);
+        assert_eq!(actual, expected);
+    }
+}
diff --git a/fontbe/src/features/properties.rs b/fontbe/src/features/properties.rs
index db06611ec..bb4d21239 100644
--- a/fontbe/src/features/properties.rs
+++ b/fontbe/src/features/properties.rs
@@ -12,6 +12,10 @@ use write_fonts::{
     types::{GlyphId, Tag},
 };
 
+use crate::features::ot_tags::{NEW_SCRIPTS, SCRIPT_ALIASES, SCRIPT_EXCEPTIONS_REVERSED};
+
+use super::ot_tags::{DFLT_SCRIPT, INDIC_SCRIPTS, NEW_SCRIPT_TAGS, SCRIPT_EXCEPTIONS, USE_SCRIPTS};
+
 // SAFETY: we can easily verify that neither of these strings contains a nul byte.
 // (this is the only way we can declare these as constants)
 pub const COMMON_SCRIPT: UnicodeShortName =
@@ -56,8 +60,11 @@ impl CharMap for Vec<(Arc<Glyph>, GlyphId)> {
 
 impl ScriptDirection {
     /// Returns the writing direction for the provided script
+    // <https://github.com/googlefonts/ufo2ft/blob/f6b4f42460b340c/Lib/ufo2ft/featureWriters/kernFeatureWriter.py#L63>
     pub(crate) fn for_script(script: &UnicodeShortName) -> Self {
         match script.as_str() {
+            // this list from
+            // <https://github.com/fonttools/fonttools/blob/8697f91cdc/Lib/fontTools/unicodedata/__init__.py#L141>
             "Zyyy" => ScriptDirection::Auto,
             "Arab" | "Hebr" | "Syrc" | "Thaa" | "Cprt" | "Khar" | "Phnx" | "Nkoo" | "Lydi"
             | "Avst" | "Armi" | "Phli" | "Prti" | "Sarb" | "Orkh" | "Samr" | "Mand" | "Merc"
@@ -200,148 +207,6 @@ pub(crate) fn glyphs_by_bidi_class(
     )
 }
 
-const DFLT_SCRIPT: Tag = Tag::new(b"DFLT");
-
-static SCRIPT_ALIASES: &[(Tag, Tag)] = &[(Tag::new(b"jamo"), Tag::new(b"hang"))];
-
-static SCRIPT_EXCEPTIONS: &[(&str, Tag)] = &[
-    ("Hira", Tag::new(b"kana")),
-    ("Hrkt", Tag::new(b"kana")),
-    ("Laoo", Tag::new(b"lao ")),
-    ("Nkoo", Tag::new(b"nko ")),
-    ("Vaii", Tag::new(b"vai ")),
-    ("Yiii", Tag::new(b"yi  ")),
-    ("Zinh", DFLT_SCRIPT),
-    ("Zmth", Tag::new(b"math")),
-    ("Zyyy", DFLT_SCRIPT),
-    ("Zzzz", DFLT_SCRIPT),
-];
-
-// 'math' is used as a script in opentype features:
-// <https://github.com/harfbuzz/harfbuzz/pull/3417>
-static SCRIPT_EXCEPTIONS_REVERSED: &[(Tag, &str)] = &[(Tag::new(b"math"), "Zmth")];
-
-// I don't know what's going on here, just copying OTTags.py
-static NEW_SCRIPTS: &[(Tag, &str)] = &[
-    (Tag::new(b"bng2"), "Beng"),
-    (Tag::new(b"dev2"), "Deva"),
-    (Tag::new(b"gjr2"), "Gujr"),
-    (Tag::new(b"gur2"), "Guru"),
-    (Tag::new(b"knd2"), "Knda"),
-    (Tag::new(b"mlm2"), "Mlym"),
-    (Tag::new(b"mym2"), "Mymr"),
-    (Tag::new(b"ory2"), "Orya"),
-    (Tag::new(b"tel2"), "Telu"),
-    (Tag::new(b"tml2"), "Taml"),
-];
-
-// I don't know what's going on here, just copying OTTags.py
-static NEW_SCRIPT_TAGS: &[(&str, Tag)] = &[
-    ("Beng", Tag::new(b"bng2")),
-    ("Deva", Tag::new(b"dev2")),
-    ("Gujr", Tag::new(b"gjr2")),
-    ("Guru", Tag::new(b"gur2")),
-    ("Kana", Tag::new(b"knd2")),
-    ("Mlym", Tag::new(b"mlm2")),
-    ("Mymr", Tag::new(b"mym2")),
-    ("Orya", Tag::new(b"ory2")),
-    ("Taml", Tag::new(b"tml2")),
-    ("Telu", Tag::new(b"tel2")),
-];
-
-static INDIC_SCRIPTS: &[&str] = &[
-    "Beng", // Bengali
-    "Deva", // Devanagari
-    "Gujr", // Gujarati
-    "Guru", // Gurmukhi
-    "Knda", // Kannada
-    "Mlym", // Malayalam
-    "Orya", // Oriya
-    "Sinh", // Sinhala
-    "Taml", // Tamil
-    "Telu", // Telugu
-];
-
-static USE_SCRIPTS: &[&str] = &[
-    // Correct as at Unicode 15.0
-    "Adlm", // Adlam
-    "Ahom", // Ahom
-    "Bali", // Balinese
-    "Batk", // Batak
-    "Brah", // Brahmi
-    "Bugi", // Buginese
-    "Buhd", // Buhid
-    "Cakm", // Chakma
-    "Cham", // Cham
-    "Chrs", // Chorasmian
-    "Cpmn", // Cypro Minoan
-    "Diak", // Dives Akuru
-    "Dogr", // Dogra
-    "Dupl", // Duployan
-    "Egyp", // Egyptian Hieroglyphs
-    "Elym", // Elymaic
-    "Gong", // Gunjala Gondi
-    "Gonm", // Masaram Gondi
-    "Gran", // Grantha
-    "Hano", // Hanunoo
-    "Hmng", // Pahawh Hmong
-    "Hmnp", // Nyiakeng Puachue Hmong
-    "Java", // Javanese
-    "Kali", // Kayah Li
-    "Kawi", // Kawi
-    "Khar", // Kharosthi
-    "Khoj", // Khojki
-    "Kits", // Khitan Small Script
-    "Kthi", // Kaithi
-    "Lana", // Tai Tham
-    "Lepc", // Lepcha
-    "Limb", // Limbu
-    "Mahj", // Mahajani
-    "Maka", // Makasar
-    "Mand", // Mandaic
-    "Mani", // Manichaean
-    "Marc", // Marchen
-    "Medf", // Medefaidrin
-    "Modi", // Modi
-    "Mong", // Mongolian
-    "Mtei", // Meetei Mayek
-    "Mult", // Multani
-    "Nagm", // Nag Mundari
-    "Nand", // Nandinagari
-    "Newa", // Newa
-    "Nhks", // Bhaiksuki
-    "Nko ", // Nko
-    "Ougr", // Old Uyghur
-    "Phag", // Phags Pa
-    "Phlp", // Psalter Pahlavi
-    "Plrd", // Miao
-    "Rjng", // Rejang
-    "Rohg", // Hanifi Rohingya
-    "Saur", // Saurashtra
-    "Shrd", // Sharada
-    "Sidd", // Siddham
-    "Sind", // Khudawadi
-    "Sogd", // Sogdian
-    "Sogo", // Old Sogdian
-    "Soyo", // Soyombo
-    "Sund", // Sundanese
-    "Sylo", // Syloti Nagri
-    "Tagb", // Tagbanwa
-    "Takr", // Takri
-    "Tale", // Tai Le
-    "Tavt", // Tai Viet
-    "Tfng", // Tifinagh
-    "Tglg", // Tagalog
-    "Tibt", // Tibetan
-    "Tirh", // Tirhuta
-    "Tnsa", // Tangsa
-    "Toto", // Toto
-    "Vith", // Vithkuqi
-    "Wcho", // Wancho
-    "Yezi", // Yezidi
-    "Zanb", // Zanabazar Square
-];
-
 pub(crate) fn dist_feature_enabled_scripts() -> HashSet<UnicodeShortName> {
     INDIC_SCRIPTS
         .iter()
diff --git a/fontbe/src/orchestration.rs b/fontbe/src/orchestration.rs
index da189842f..a92d2d441 100644
--- a/fontbe/src/orchestration.rs
+++ b/fontbe/src/orchestration.rs
@@ -1,7 +1,7 @@
 //! Helps coordinate the graph execution for BE
 
 use std::{
-    collections::{BTreeMap, HashMap, HashSet},
+    collections::{BTreeMap, HashSet},
     fmt::Display,
     fs::File,
     io::{self, BufReader, BufWriter, Read, Write},
@@ -377,7 +377,7 @@ pub struct FeaRsKerns {
     /// ordered!
     pub lookups: Vec<Vec<PairPosBuilder>>,
     /// each value is a set of lookups, referenced by their order in array above
-    pub features: HashMap<FeatureKey, Vec<usize>>,
+    pub features: BTreeMap<FeatureKey, Vec<usize>>,
 }
 
 /// The abstract syntax tree of any user FEA.