-
Notifications
You must be signed in to change notification settings - Fork 18
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
* Initial commit * Reduce chunk size to 100, to avoid pathological compile times * rustfmt * Loosen deps * Tests, kind of * Rename crate * Use serde_json to store the data
- Loading branch information
1 parent
6e0838b
commit a8fa72c
Showing
3 changed files
with
216 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,23 @@ | ||
[package] | ||
name = "google-fonts-languages" | ||
version = "0.6.3" | ||
edition = "2021" | ||
|
||
[dependencies] | ||
bytes = "1.7.1" | ||
prost = "0.13" | ||
serde = { version = "1.0", features = ["derive"] } | ||
serde_json = "1.0" | ||
|
||
# Crates used only by build.rs to generate the Rust sources at compile time.
[build-dependencies]
prost-build = "0.13"
# The three protobuf crates come from the same fork branch; keep them in lock-step.
protobuf-support = { git = "https://github.com/cmyr/rust-protobuf", branch = "parse-unicode-strings" }
protobuf = { git = "https://github.com/cmyr/rust-protobuf", branch = "parse-unicode-strings" }
protobuf-parse = { git = "https://github.com/cmyr/rust-protobuf", branch = "parse-unicode-strings" }
# Pinned to a real semver range: a bare "*" would accept any future, possibly
# breaking, major release.
glob = "0.3"
prettyplease = "0.2"
quote = "1.0"
proc-macro2 = "1.0"
syn = "2.0"
itertools = "0.13"
serde_json = "1.0"
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,168 @@ | ||
use proc_macro2::TokenStream; | ||
use protobuf::reflect::{FieldDescriptor, ReflectValueRef}; | ||
use quote::quote; | ||
use serde_json::Map; | ||
use std::io::{BufWriter, Write}; | ||
use std::{env, fs::File, path::Path}; | ||
|
||
fn main() { | ||
// First we load up the descriptor using the protobuf crate | ||
// so that we can do reflection on it. | ||
let descriptors = protobuf_parse::Parser::new() | ||
.pure() | ||
.include(".") | ||
.input("Lib/gflanguages/languages_public.proto") | ||
.file_descriptor_set() | ||
.expect("Could not parse languages_public.proto"); | ||
let protofile = descriptors.file.first().expect("No file in descriptor"); | ||
let descriptor = protobuf::reflect::FileDescriptor::new_dynamic(protofile.clone(), &[]) | ||
.expect("Could not create descriptor"); | ||
|
||
// Now we use the prost crate to compile them, so that we can | ||
// generate Rust structs. | ||
let mut config = prost_build::Config::new(); | ||
// config.boxed(".google.languages_public.LanguageProto.sample_text"); | ||
// config.boxed(".google.languages_public.LanguageProto.exemplar_chars"); | ||
|
||
// The reflection can tell us what messages we have, so we can configure | ||
// them to be deserializable with serde | ||
for message in descriptor.messages() { | ||
config.type_attribute( | ||
message.full_name(), | ||
"#[derive(serde::Serialize, serde::Deserialize)]", | ||
); | ||
} | ||
// Let's make our structs; this produces google.languages_public.rs | ||
config | ||
.compile_protos( | ||
&["Lib/gflanguages/languages_public.proto"], | ||
&["Lib/gflanguages/"], | ||
) | ||
.expect("Could not compile languages_public.proto"); | ||
|
||
let path = Path::new(&env::var("OUT_DIR").unwrap()).join("data.rs"); | ||
let mut file = BufWriter::new(File::create(path).unwrap()); | ||
let mut output = quote! { use std::collections::BTreeMap; use std::sync::LazyLock; }; | ||
|
||
output.extend(serialize_a_structure( | ||
".google.languages_public.RegionProto", | ||
"Lib/gflanguages/data/regions/*.textproto", | ||
"REGIONS", | ||
&descriptor, | ||
)); | ||
|
||
output.extend(serialize_a_structure( | ||
".google.languages_public.ScriptProto", | ||
"Lib/gflanguages/data/scripts/*.textproto", | ||
"SCRIPTS", | ||
&descriptor, | ||
)); | ||
|
||
output.extend(serialize_a_structure( | ||
".google.languages_public.LanguageProto", | ||
"Lib/gflanguages/data/languages/*.textproto", | ||
"LANGUAGES", | ||
&descriptor, | ||
)); | ||
// file.write_all(output.to_string().as_bytes()) | ||
// .expect("Could not write to file"); | ||
|
||
let abstract_file: syn::File = syn::parse2(output).expect("Could not parse output"); | ||
let formatted = prettyplease::unparse(&abstract_file); | ||
file.write_all(formatted.as_bytes()) | ||
.expect("Could not write to file"); | ||
} | ||
|
||
fn serialize_a_structure( | ||
proto_name: &str, | ||
pathglob: &str, | ||
output_variable: &str, | ||
descriptor: &protobuf::reflect::FileDescriptor, | ||
) -> TokenStream { | ||
let proto = descriptor | ||
.message_by_full_name(proto_name) | ||
.unwrap_or_else(|| panic!("No {} message", proto_name)); | ||
let files: Vec<std::path::PathBuf> = glob::glob(pathglob) | ||
.expect("Failed to read glob pattern") | ||
.flatten() | ||
.collect(); | ||
let name: TokenStream = proto.name().parse().unwrap(); | ||
let variable: TokenStream = output_variable.parse().unwrap(); | ||
let mut map = Map::new(); | ||
for file in files.into_iter() { | ||
serialize_file(file, &proto, &mut map); | ||
} | ||
let json_var: TokenStream = format!("__{}", output_variable).parse().unwrap(); | ||
let docmsg = format!("A map of all the {} objects", name); | ||
let json_dump = serde_json::to_string(&map).expect("Could not serialize"); | ||
quote! { | ||
static #json_var: &str = #json_dump; | ||
|
||
#[doc = #docmsg] | ||
pub static #variable: LazyLock<BTreeMap<String, Box<#name>>> = LazyLock::new(|| { | ||
serde_json::from_str(#json_var).expect("Could not deserialize") | ||
}); | ||
} | ||
} | ||
fn serialize_file( | ||
path: std::path::PathBuf, | ||
descriptor: &protobuf::reflect::MessageDescriptor, | ||
value: &mut Map<String, serde_json::Value>, | ||
) { | ||
let mut message = descriptor.new_instance(); | ||
let message_mut = message.as_mut(); | ||
let input = std::fs::read_to_string(&path).expect("Could not read file"); | ||
protobuf::text_format::merge_from_str(message_mut, &input) | ||
.unwrap_or_else(|e| panic!("Could not parse file {:?}: {:?}", path, e)); | ||
let id = path.file_stem().unwrap().to_str().unwrap(); | ||
value.insert(id.to_string(), serialize_message(message_mut)); | ||
} | ||
|
||
fn serialize_message(message: &dyn protobuf::MessageDyn) -> serde_json::Value { | ||
let descriptor = message.descriptor_dyn(); | ||
// let descriptor_name: TokenStream = descriptor.name().parse().unwrap(); | ||
let mut output = Map::new(); | ||
for field in descriptor.fields() { | ||
let field_name: TokenStream = field.name().parse().unwrap(); | ||
let field_contents = serialize_field(&field, message); | ||
output.insert(field_name.to_string(), field_contents); | ||
} | ||
output.into() | ||
} | ||
|
||
fn serialize_field( | ||
field: &FieldDescriptor, | ||
message: &dyn protobuf::MessageDyn, | ||
) -> serde_json::Value { | ||
if field.is_repeated() { | ||
let v: Vec<serde_json::Value> = field | ||
.get_repeated(message) | ||
.into_iter() | ||
.map(|value| serialize_field_value(field, value)) | ||
.collect(); | ||
v.into() | ||
} else if field.is_required() { | ||
serialize_field_value(field, field.get_singular(message).unwrap()) | ||
} else if field.has_field(message) { | ||
let value = serialize_field_value(field, field.get_singular(message).unwrap()); | ||
value.into() | ||
} else { | ||
serde_json::Value::Null | ||
} | ||
} | ||
|
||
fn serialize_field_value(_field: &FieldDescriptor, value: ReflectValueRef) -> serde_json::Value { | ||
match value { | ||
ReflectValueRef::Bool(value) => value.into(), | ||
ReflectValueRef::I32(value) => value.into(), | ||
ReflectValueRef::I64(value) => value.into(), | ||
ReflectValueRef::U32(value) => value.into(), | ||
ReflectValueRef::U64(value) => value.into(), | ||
ReflectValueRef::F32(value) => value.into(), | ||
ReflectValueRef::F64(value) => value.into(), | ||
ReflectValueRef::String(value) => value.into(), | ||
ReflectValueRef::Bytes(value) => value.into(), | ||
ReflectValueRef::Enum(_value, _ix) => unimplemented!(), | ||
ReflectValueRef::Message(value) => serialize_message(&*value), | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,25 @@ | ||
include!(concat!(env!("OUT_DIR"), "/google.languages_public.rs")); | ||
include!(concat!(env!("OUT_DIR"), "/data.rs")); | ||
|
||
#[cfg(test)]
mod tests {
    use super::*;

    /// The region table has a "BG" entry named "Bulgaria".
    #[test]
    fn regions() {
        assert!(REGIONS.contains_key("BG"));
        let bulgaria = REGIONS.get("BG").unwrap();
        assert_eq!(bulgaria.name.as_deref(), Some("Bulgaria"));
    }

    /// The script table has an "Arab" entry named "Arabic".
    #[test]
    fn scripts() {
        assert!(SCRIPTS.contains_key("Arab"));
        let arabic = SCRIPTS.get("Arab").unwrap();
        assert_eq!(arabic.name.as_deref(), Some("Arabic"));
    }

    /// The language table is populated and contains "ar_Arab".
    #[test]
    fn languages() {
        let language_count = LANGUAGES.len();
        assert!(language_count > 1000);
        assert!(LANGUAGES.contains_key("ar_Arab"));
    }
}