Rust crate (#158)
* Initial commit

* Reduce chunk size to 100, to avoid pathological compile times

* rustfmt

* Loosen deps

* Tests, kind of

* Rename crate

* Use serde_json to store the data
simoncozens authored Nov 8, 2024
1 parent 6e0838b commit a8fa72c
Showing 3 changed files with 216 additions and 0 deletions.
23 changes: 23 additions & 0 deletions Cargo.toml
@@ -0,0 +1,23 @@
[package]
name = "google-fonts-languages"
version = "0.6.3"
edition = "2021"

[dependencies]
bytes = "1.7.1"
prost = "0.13"
serde = { version = "1.0", features = ["derive"] }
serde_json = "1.0"

[build-dependencies]
prost-build = "0.13"
protobuf-support = { git = "https://github.com/cmyr/rust-protobuf", branch = "parse-unicode-strings" }
protobuf = { git = "https://github.com/cmyr/rust-protobuf", branch = "parse-unicode-strings" }
protobuf-parse = { git = "https://github.com/cmyr/rust-protobuf", branch = "parse-unicode-strings" }
glob = "*" # This is a joke.
prettyplease = "0.2"
quote = "1.0"
proc-macro2 = "1.0"
syn = "2.0"
itertools = "0.13"
serde_json = "1.0"
168 changes: 168 additions & 0 deletions build.rs
@@ -0,0 +1,168 @@
use proc_macro2::TokenStream;
use protobuf::reflect::{FieldDescriptor, ReflectValueRef};
use quote::quote;
use serde_json::Map;
use std::io::{BufWriter, Write};
use std::{env, fs::File, path::Path};

fn main() {
    // First we load up the descriptor using the protobuf crate
    // so that we can do reflection on it.
    let descriptors = protobuf_parse::Parser::new()
        .pure()
        .include(".")
        .input("Lib/gflanguages/languages_public.proto")
        .file_descriptor_set()
        .expect("Could not parse languages_public.proto");
    let protofile = descriptors.file.first().expect("No file in descriptor");
    let descriptor = protobuf::reflect::FileDescriptor::new_dynamic(protofile.clone(), &[])
        .expect("Could not create descriptor");

    // Now we use the prost crate to compile them, so that we can
    // generate Rust structs.
    let mut config = prost_build::Config::new();
    // config.boxed(".google.languages_public.LanguageProto.sample_text");
    // config.boxed(".google.languages_public.LanguageProto.exemplar_chars");

    // The reflection can tell us what messages we have, so we can configure
    // them to be deserializable with serde.
    for message in descriptor.messages() {
        config.type_attribute(
            message.full_name(),
            "#[derive(serde::Serialize, serde::Deserialize)]",
        );
    }
    // Let's make our structs; this produces google.languages_public.rs
    config
        .compile_protos(
            &["Lib/gflanguages/languages_public.proto"],
            &["Lib/gflanguages/"],
        )
        .expect("Could not compile languages_public.proto");

    let path = Path::new(&env::var("OUT_DIR").unwrap()).join("data.rs");
    let mut file = BufWriter::new(File::create(path).unwrap());
    let mut output = quote! { use std::collections::BTreeMap; use std::sync::LazyLock; };

    output.extend(serialize_a_structure(
        ".google.languages_public.RegionProto",
        "Lib/gflanguages/data/regions/*.textproto",
        "REGIONS",
        &descriptor,
    ));

    output.extend(serialize_a_structure(
        ".google.languages_public.ScriptProto",
        "Lib/gflanguages/data/scripts/*.textproto",
        "SCRIPTS",
        &descriptor,
    ));

    output.extend(serialize_a_structure(
        ".google.languages_public.LanguageProto",
        "Lib/gflanguages/data/languages/*.textproto",
        "LANGUAGES",
        &descriptor,
    ));
    // file.write_all(output.to_string().as_bytes())
    //     .expect("Could not write to file");

    let abstract_file: syn::File = syn::parse2(output).expect("Could not parse output");
    let formatted = prettyplease::unparse(&abstract_file);
    file.write_all(formatted.as_bytes())
        .expect("Could not write to file");
}

fn serialize_a_structure(
    proto_name: &str,
    pathglob: &str,
    output_variable: &str,
    descriptor: &protobuf::reflect::FileDescriptor,
) -> TokenStream {
    let proto = descriptor
        .message_by_full_name(proto_name)
        .unwrap_or_else(|| panic!("No {} message", proto_name));
    let files: Vec<std::path::PathBuf> = glob::glob(pathglob)
        .expect("Failed to read glob pattern")
        .flatten()
        .collect();
    let name: TokenStream = proto.name().parse().unwrap();
    let variable: TokenStream = output_variable.parse().unwrap();
    let mut map = Map::new();
    for file in files.into_iter() {
        serialize_file(file, &proto, &mut map);
    }
    let json_var: TokenStream = format!("__{}", output_variable).parse().unwrap();
    let docmsg = format!("A map of all the {} objects", name);
    let json_dump = serde_json::to_string(&map).expect("Could not serialize");
    quote! {
        static #json_var: &str = #json_dump;

        #[doc = #docmsg]
        pub static #variable: LazyLock<BTreeMap<String, Box<#name>>> = LazyLock::new(|| {
            serde_json::from_str(#json_var).expect("Could not deserialize")
        });
    }
}

fn serialize_file(
    path: std::path::PathBuf,
    descriptor: &protobuf::reflect::MessageDescriptor,
    value: &mut Map<String, serde_json::Value>,
) {
    let mut message = descriptor.new_instance();
    let message_mut = message.as_mut();
    let input = std::fs::read_to_string(&path).expect("Could not read file");
    protobuf::text_format::merge_from_str(message_mut, &input)
        .unwrap_or_else(|e| panic!("Could not parse file {:?}: {:?}", path, e));
    let id = path.file_stem().unwrap().to_str().unwrap();
    value.insert(id.to_string(), serialize_message(message_mut));
}

fn serialize_message(message: &dyn protobuf::MessageDyn) -> serde_json::Value {
    let descriptor = message.descriptor_dyn();
    // let descriptor_name: TokenStream = descriptor.name().parse().unwrap();
    let mut output = Map::new();
    for field in descriptor.fields() {
        let field_name: TokenStream = field.name().parse().unwrap();
        let field_contents = serialize_field(&field, message);
        output.insert(field_name.to_string(), field_contents);
    }
    output.into()
}

fn serialize_field(
    field: &FieldDescriptor,
    message: &dyn protobuf::MessageDyn,
) -> serde_json::Value {
    if field.is_repeated() {
        let v: Vec<serde_json::Value> = field
            .get_repeated(message)
            .into_iter()
            .map(|value| serialize_field_value(field, value))
            .collect();
        v.into()
    } else if field.is_required() || field.has_field(message) {
        serialize_field_value(field, field.get_singular(message).unwrap())
    } else {
        serde_json::Value::Null
    }
}

fn serialize_field_value(_field: &FieldDescriptor, value: ReflectValueRef) -> serde_json::Value {
    match value {
        ReflectValueRef::Bool(value) => value.into(),
        ReflectValueRef::I32(value) => value.into(),
        ReflectValueRef::I64(value) => value.into(),
        ReflectValueRef::U32(value) => value.into(),
        ReflectValueRef::U64(value) => value.into(),
        ReflectValueRef::F32(value) => value.into(),
        ReflectValueRef::F64(value) => value.into(),
        ReflectValueRef::String(value) => value.into(),
        ReflectValueRef::Bytes(value) => value.into(),
        // languages_public.proto defines no enum fields, so this arm should be unreachable.
        ReflectValueRef::Enum(_value, _ix) => unimplemented!(),
        ReflectValueRef::Message(value) => serialize_message(&*value),
    }
}
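For context, here is roughly what the quote! template above expands to in the generated OUT_DIR/data.rs. This is an illustrative sketch only: the one-entry JSON payload below is hypothetical, standing in for the real build-time string, which embeds every parsed .textproto.

use std::collections::BTreeMap;
use std::sync::LazyLock;

// Hypothetical payload; the real literal is generated by build.rs.
static __REGIONS: &str = "{\"BG\":{\"name\":\"Bulgaria\"}}";

/// A map of all the RegionProto objects
pub static REGIONS: LazyLock<BTreeMap<String, Box<RegionProto>>> = LazyLock::new(|| {
    serde_json::from_str(__REGIONS).expect("Could not deserialize")
});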
25 changes: 25 additions & 0 deletions src/lib.rs
@@ -0,0 +1,25 @@
include!(concat!(env!("OUT_DIR"), "/google.languages_public.rs"));
include!(concat!(env!("OUT_DIR"), "/data.rs"));

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn regions() {
        assert!((*REGIONS).contains_key("BG"));
        assert_eq!(REGIONS.get("BG").unwrap().name.as_deref(), Some("Bulgaria"));
    }

    #[test]
    fn scripts() {
        assert!((*SCRIPTS).contains_key("Arab"));
        assert_eq!(SCRIPTS.get("Arab").unwrap().name.as_deref(), Some("Arabic"));
    }

    #[test]
    fn languages() {
        assert!(LANGUAGES.len() > 1000);
        assert!((*LANGUAGES).contains_key("ar_Arab"));
    }
}
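A minimal downstream usage sketch (hypothetical consumer code; it assumes LanguageProto carries an optional name field, mirroring RegionProto and ScriptProto in the tests above):

// A consumer's Cargo.toml would depend on google-fonts-languages.
use google_fonts_languages::{LANGUAGES, REGIONS};

fn main() {
    // Each map is deserialized from the embedded JSON on first access.
    let arabic = LANGUAGES.get("ar_Arab").expect("ar_Arab should be present");
    // Assumed field: `name` as Option<String>, as on RegionProto above.
    println!("{:?}", arabic.name.as_deref());
    println!("{} regions known", REGIONS.len());
}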
