Loading glyphsLib derived data is showing up in profiles, try to make it not do so
googlefonts · Oct 24, 2024 · 143c8d1 · 143c8d1
1 parent 081b604
commit 143c8d1
Show file tree

Hide file tree

Showing 9 changed files with 672 additions and 663 deletions.
diff --git a/glyphs-reader/Cargo.toml b/glyphs-reader/Cargo.toml
@@ -35,10 +35,3 @@ bincode.workspace = true
 [dev-dependencies]
 pretty_assertions.workspace = true
 rstest.workspace = true
-
-[build-dependencies]
-quick-xml = "0.36"
-smol_str.workspace = true
-serde.workspace = true
-thiserror.workspace = true
-bincode.workspace = true
diff --git a/glyphs-reader/build.rs b/glyphs-reader/build.rs
diff --git a/glyphs-reader/data/update.py b/glyphs-reader/data/update.py
@@ -1,43 +1,188 @@
-"""Update bundled xml files
+"""Update bundled data derived from glyphsLib GlyphData.xml and GlyphData_Ideographs.xml.
 
-We try to match the behaviour of the python toolchain, so we want to ship the
-same data files as are currently bundled in glyphsLib. This script copies those
-files out of the currently active version of glyphsLib.
+This script copies files out of the currently active version of glyphsLib and generates
+Rust code for efficient access to the default data. Override files must be loaded separately
+from XML. We only generate code for the fields we actively use.
 
 Usage:
-    python data/update.py
+    python glyphs-reader/data/update.py
 """
 
+import dataclasses
+from dataclasses import dataclass
 import glyphsLib
 from importlib import resources
-import os
-import shutil
+from io import StringIO
+from lxml import etree
+from pathlib import Path
+from textwrap import dedent
+from typing import Optional, Tuple
 
-def script_dir():
-    return os.path.dirname(os.path.abspath(__file__))
 
-def get_data_file(filepath):
-    return resources.files(glyphsLib).joinpath("data").joinpath(filepath)
+@dataclass(frozen=True)
+class GlyphInfo:
+    """One glyph metadata record read from a glyphsLib GlyphData XML file.
+
+    Frozen, so instances are hashable and can be deduplicated with sets.
+    """
+
+    # Raw value of the XML "unicode" attribute (a hexadecimal string), or None
+    # when the glyph has no codepoint or the record was created for an alt name.
+    codepoint: Optional[str]
+    # Glyph name, or the alternate name for records created from "altNames".
+    name: str
+    # XML "category" attribute with spaces removed.
+    category: str
+    # XML "subCategory" attribute with spaces removed, or None when absent.
+    subcategory: Optional[str]
 
 
-def copy_data_files():
-    target_dir = script_dir()
-    for target in ["GlyphData.xml", "GlyphData_Ideographs.xml"]:
-        file = get_data_file(target)
-        target = os.path.join(target_dir, target)
-        with file.open("rb") as source, open(target, "wb") as dest:
-            shutil.copyfileobj(source, dest)
+def codename(name: Optional[str]) -> Optional[str]:
+    """Return `name` with every space removed; None is passed through unchanged.
+
+    The result is used as a Rust enum variant name by the code generator below.
+    """
+    return None if name is None else name.replace(" ", "")
 
-def write_version_file():
-    version = glyphsLib.__version__
-    with open(os.path.join(script_dir(), 'VERSION'), 'w') as f:
-        f.write(f"XML files copied from glyphsLib version {version}.\n"
-                "(this file generated by update.py)\n")
 
-def main(_):
-    copy_data_files()
-    write_version_file()
+def read_glyph_info(file: str) -> Tuple[GlyphInfo, ...]:
+    """Read glyph records from one of glyphsLib's bundled GlyphData XML files.
+
+    Args:
+        file: File name inside glyphsLib's ``data`` directory,
+            e.g. "GlyphData.xml".
+
+    Returns:
+        One GlyphInfo per distinct glyph name, plus one per alternate name
+        that does not collide with an already known name. Alternate-name
+        records copy the category and subcategory of their glyph but carry
+        no codepoint.
+    """
+    # Open through the Traversable API instead of assuming the resource is a
+    # plain filesystem path; binary mode lets lxml honour the XML encoding.
+    source = resources.files(glyphsLib).joinpath("data").joinpath(file)
+    with source.open("rb") as f:
+        tree = etree.parse(f)
+
+    by_name = {}
+
+    # Do a full pass to collect names; the first record seen for a name wins
+    for e in tree.xpath("//glyph"):
+        info = GlyphInfo(
+            e.attrib.get("unicode", None),
+            e.attrib["name"],
+            codename(e.attrib["category"]),
+            codename(e.attrib.get("subCategory", None)),
+        )
+        if info.name not in by_name:
+            by_name[info.name] = info
+        else:
+            print(f"We've already seen {info.name}!")
+
+    # Then add alt_names where they don't overlap names
+    for e in tree.xpath("//glyph[@altNames]"):
+        for alt_name in e.attrib["altNames"].split(","):
+            # The list is comma separated and may carry whitespace around the
+            # commas; without stripping, "a, b" would register the name " b".
+            alt_name = alt_name.strip()
+            if not alt_name:
+                continue
+            if alt_name in by_name:
+                print(f'Ignoring alt name "{alt_name}", already taken')
+                continue
+            by_name[alt_name] = dataclasses.replace(
+                by_name[e.attrib["name"]], name=alt_name, codepoint=None
+            )
+
+    return tuple(by_name.values())
+
+
+def main():
+    """Generate src/glyphslib_data.rs from glyphsLib's bundled glyph data.
+
+    Reads GlyphData.xml and GlyphData_Ideographs.xml, merges and name-sorts
+    the records, then writes a Rust source file containing the Category and
+    Subcategory enums (each with a FromStr impl), the GLYPH_INFO table sorted
+    by name, and the CODEPOINT_TO_INFO_IDX table sorted by codepoint.
+    """
+    # Records identical in both files collapse in the set union; the result is
+    # ordered by name so the generated table can be binary searched.
+    glyph_infos = sorted(
+        set(read_glyph_info("GlyphData.xml"))
+        | set(read_glyph_info("GlyphData_Ideographs.xml")),
+        key=lambda g: g.name,
+    )
+    names = {g.name for g in glyph_infos}
+    categories = {g.category for g in glyph_infos}
+    subcategories = {g.subcategory for g in glyph_infos if g.subcategory is not None}
+    # A name that appears with differing data in the two files would survive
+    # the union twice; fail loudly rather than emit an ambiguous table.
+    assert len(names) == len(glyph_infos), "Names aren't unique?"
+    # Map each codepoint to the index (into glyph_infos) of the first record,
+    # in name order, that claims it.
+    codepoints = {}
+    for i, gi in enumerate(glyph_infos):
+        if gi.codepoint is None:
+            continue
+        # The codepoint is carried as a hexadecimal string
+        codepoint = int(gi.codepoint, 16)
+        if codepoint not in codepoints:
+            codepoints[codepoint] = i
+        else:
+            print(
+                f"Multiple names are assigned 0x{codepoint:04x}, using the first one we saw"
+            )
+
+    # The output goes to the src/ directory next to this script's directory
+    dest_file = Path(__file__).parent.parent / "src" / "glyphslib_data.rs"
+
+    with open(dest_file, "w") as f:
+        # Module-level doc comment recording where the data came from
+        f.write(
+            f"//! Glyph data generated from glyphsLib {glyphsLib.__version__} by {Path(__file__).name}\n"
+        )
+        f.write("//!\n")
+        f.write(f"//! {len(glyph_infos)} glyph metadata records taken from glyphsLib\n")
+
+        # Imports plus the opening of the Category enum
+        f.write(
+            dedent(
+                """
+                use std::str::FromStr;
+                use smol_str::SmolStr;
+                use crate::glyphdata::GlyphInfo;
+
+                /// The primary category for a given glyph
+                ///
+                /// Generated to ensure it matches the glyphsLib dataset.
+                ///
+                /// These categories are not the same as the unicode character categories.
+                #[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
+                #[repr(u8)]
+                pub enum Category {
+			"""
+            )
+        )
+        # One variant per distinct category, in sorted order
+        for category in sorted(categories):
+            f.write(f"    {category},\n")
+        f.write("}\n")
+
+        # FromStr for Category; unknown input is returned as the error value
+        f.write("impl FromStr for Category {\n")
+        f.write("    type Err = SmolStr;\n\n")
+        f.write("    fn from_str(s: &str) -> Result<Self, Self::Err> {\n")
+        f.write("        match s {\n")
+        for category in sorted(categories):
+            f.write(f'            "{category}" => Ok(Self::{category}),\n')
+        f.write(f"            _ => Err(s.into()),\n")
+        f.write("        }\n")
+        f.write("    }\n")
+        f.write("}\n")
+        f.write("\n")
+
+        # Opening of the Subcategory enum
+        f.write(
+            dedent(
+                """
+			/// The secondary category for a given glyph
+			///
+			/// Generated to ensure it matches the glyphsLib dataset.
+			#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash, PartialOrd, Ord)]
+			#[repr(u8)]
+			pub enum Subcategory {
+			"""
+            )
+        )
+        # One variant per distinct subcategory, in sorted order
+        for subcategory in sorted(subcategories):
+            f.write(f"    {subcategory},\n")
+        f.write("}\n\n")
+
+        # FromStr for Subcategory; unknown input is returned as the error value
+        f.write("impl FromStr for Subcategory {\n")
+        f.write("    type Err = SmolStr;\n\n")
+        f.write("    fn from_str(s: &str) -> Result<Self, Self::Err> {\n")
+        f.write("        match s {\n")
+        for subcategory in sorted(subcategories):
+            f.write(f'            "{subcategory}" => Ok(Subcategory::{subcategory}),\n')
+        f.write(f"            _ => Err(s.into()),\n")
+        f.write("        }\n")
+        f.write("    }\n")
+        f.write("}\n")
+        f.write("\n")
+
+        # The glyph table, emitted in the same name-sorted order as glyph_infos
+        f.write("// Sorted by name, has unique names, therefore safe to bsearch\n")
+        f.write("pub(crate) const GLYPH_INFO: &[GlyphInfo] = &[\n")
+        for gi in glyph_infos:
+            # Optional fields are rendered as Rust Option literals
+            codepoint = "None"
+            if gi.codepoint is not None:
+                codepoint = f"Some(0x{gi.codepoint})"
+            subcategory = "None"
+            if gi.subcategory is not None:
+                subcategory = f"Some(Subcategory::{gi.subcategory})"
+            f.write(
+                f'    GlyphInfo::new("{gi.name}", Category::{gi.category}, {subcategory}, {codepoint}),\n'
+            )
+
+        f.write("];\n")
+
+        # The codepoint lookup table; each entry's index points into GLYPH_INFO
+        f.write(
+            "// Sorted by codepoint, has unique codepoints, therefore safe to bsearch\n"
+        )
+        f.write("pub(crate) const CODEPOINT_TO_INFO_IDX: &[(u32, usize)] = &[\n")
+        for codepoint, i in sorted(codepoints.items()):
+            f.write(f"    (0x{codepoint:04x}, {i}), // {glyph_infos[i].name}\n")
+
+        f.write("];\n")
 
 
 if __name__ == "__main__":
-    main(None)
+    main()
diff --git a/glyphs-reader/src/font.rs b/glyphs-reader/src/font.rs
@@ -11,7 +11,8 @@ use std::hash::Hash;
 use std::str::FromStr;
 use std::{fs, path};
 
-use crate::glyphdata::{Category, GlyphData, Subcategory};
+use crate::glyphdata::GlyphData;
+use crate::{Category, Subcategory};
 use ascii_plist_derive::FromPlist;
 use fontdrasil::types::WidthClass;
 use kurbo::{Affine, Point, Vec2};
@@ -205,7 +206,7 @@ pub struct Glyph {
     /// The right kerning group
     pub right_kern: Option<SmolStr>,
     pub category: Option<Category>,
-    pub sub_category: Subcategory,
+    pub sub_category: Option<Subcategory>,
 }
 
 impl Glyph {
@@ -214,7 +215,7 @@ impl Glyph {
             (self.category, self.sub_category),
             (
                 Some(Category::Mark),
-                Subcategory::Nonspacing | Subcategory::SpacingCombining
+                Some(Subcategory::Nonspacing) | Some(Subcategory::SpacingCombining)
             )
         )
     }
@@ -1898,7 +1899,7 @@ impl TryFrom<RawLayer> for Layer {
 
 impl RawGlyph {
     // we pass in the radix because it depends on the version, stored in the font struct
-    fn build(self, codepoint_radix: u32) -> Result<Glyph, Error> {
+    fn build(self, codepoint_radix: u32, glyph_data: &GlyphData) -> Result<Glyph, Error> {
         let mut instances = Vec::new();
         for layer in self.layers {
             if layer.is_draft() {
@@ -1933,12 +1934,12 @@ impl RawGlyph {
             .unwrap_or_default();
 
         if category.is_none() || sub_category.is_none() {
-            if let Some((computed_category, computed_subcategory)) =
-                get_glyph_category(&self.glyphname, &codepoints)
+            if let Some((computed_category, computed_subcategory, _)) =
+                glyph_data.query(&self.glyphname, Some(&codepoints))
             {
                 // if they were manually set don't change them, otherwise do
                 category = category.or(Some(computed_category));
-                sub_category = sub_category.or(Some(computed_subcategory));
+                sub_category = sub_category.or(computed_subcategory);
             }
         }
 
@@ -1950,20 +1951,11 @@ impl RawGlyph {
             right_kern: self.kern_right,
             unicode: codepoints,
             category,
-            sub_category: sub_category.unwrap_or_default(),
+            sub_category,
         })
     }
 }
 
-// This will eventually need to be replaced with something that can handle
-// custom GlyphData.xml files, as well as handle overrides that are part of the
-// glyph source.
-fn get_glyph_category(name: &str, codepoints: &BTreeSet<u32>) -> Option<(Category, Subcategory)> {
-    GlyphData::bundled()
-        .get_glyph(name, Some(codepoints))
-        .map(|info| (info.category, info.subcategory))
-}
-
 // https://github.com/googlefonts/glyphsLib/blob/24b4d340e4c82948ba121dcfe563c1450a8e69c9/Lib/glyphsLib/builder/constants.py#L186
 #[rustfmt::skip]
 static GLYPHS_TO_OPENTYPE_LANGUAGE_ID: &[(&str, i32)] = &[
@@ -2239,6 +2231,9 @@ impl TryFrom<RawFont> for Font {
             from.v2_to_v3_names()?;
         }
 
+        // TODO: this should be provided in a manner that allows for overrides
+        let glyph_data = GlyphData::glyphs_lib_data();
+
         let radix = if from.is_v2() { 16 } else { 10 };
         let glyph_order = parse_glyph_order(&from);
 
@@ -2277,7 +2272,10 @@ impl TryFrom<RawFont> for Font {
 
         let mut glyphs = BTreeMap::new();
         for raw_glyph in from.glyphs.into_iter() {
-            glyphs.insert(raw_glyph.glyphname.clone(), raw_glyph.build(radix)?);
+            glyphs.insert(
+                raw_glyph.glyphname.clone(),
+                raw_glyph.build(radix, &glyph_data)?,
+            );
         }
 
         let mut features = Vec::new();
@@ -2615,9 +2613,9 @@ mod tests {
             default_master_idx, RawAxisUserToDesignMap, RawFeature, RawFont, RawFontMaster,
             RawUserToDesignMapping,
         },
-        glyphdata::{Category, Subcategory},
+        glyphdata::GlyphData,
         plist::FromPlist,
-        Font, FontMaster, Node, Shape,
+        Category, Font, FontMaster, Node, Shape,
     };
     use std::{
         collections::{BTreeMap, BTreeSet, HashSet},
@@ -3568,9 +3566,11 @@ mod tests {
             ..Default::default()
         };
 
-        let cooked = raw.build(16).unwrap();
-        assert_eq!(cooked.category, Some(Category::Letter));
-        assert_eq!(cooked.sub_category, Subcategory::None);
+        let cooked = raw.build(16, &GlyphData::glyphs_lib_data()).unwrap();
+        assert_eq!(
+            (cooked.category, cooked.sub_category),
+            (Some(Category::Letter), None)
+        );
     }
 
     #[test]