From 8ce37970e3a20562c779ee2f246c42c99e97b7c6 Mon Sep 17 00:00:00 2001 From: phenylshima <49227365+femshima@users.noreply.github.com> Date: Tue, 6 Feb 2024 20:07:41 +0900 Subject: [PATCH] Fix jlabel parser/serializer bugs (#25) * fix reversed logic of e5/g5 * remove colon * fix word formatter * add new test * use warning class --- crates/jlabel/src/fullcontext_label.rs | 7 +++ crates/jlabel/src/parser.rs | 4 +- crates/jlabel/src/serializer.rs | 23 +++++--- crates/jlabel/tests/fixtures.rs | 73 +++++++++++++++++++++++++- 4 files changed, 96 insertions(+), 11 deletions(-) diff --git a/crates/jlabel/src/fullcontext_label.rs b/crates/jlabel/src/fullcontext_label.rs index 85e34fa..817a7e1 100644 --- a/crates/jlabel/src/fullcontext_label.rs +++ b/crates/jlabel/src/fullcontext_label.rs @@ -108,6 +108,13 @@ pub struct AccentPhrasePrevNext { /// E3/G3: whether the accent phrase interrogative or not pub is_interrogative: bool, /// E5/G5: whether pause insertion or not in between the accent phrase and the current accent phrase + /// + /// <div class="warning">
+ /// + /// The logic of this field is reversed from the E5/G5 of full-context label: + /// "1" is false and "0" is true. + /// + /// </div>
pub is_pause_insertion: Option, } diff --git a/crates/jlabel/src/parser.rs b/crates/jlabel/src/parser.rs index 2ce8f48..38dc32f 100644 --- a/crates/jlabel/src/parser.rs +++ b/crates/jlabel/src/parser.rs @@ -174,7 +174,7 @@ impl<'a> LabelTokenizer<'a> { mora_count: e1, accent_position: e2, is_interrogative: e3, - is_pause_insertion: e5, + is_pause_insertion: e5.map(|e5| !e5), })) } else { Ok(None) @@ -222,7 +222,7 @@ impl<'a> LabelTokenizer<'a> { mora_count: g1, accent_position: g2, is_interrogative: g3, - is_pause_insertion: g5, + is_pause_insertion: g5.map(|g5| !g5), })) } else { Ok(None) diff --git a/crates/jlabel/src/serializer.rs b/crates/jlabel/src/serializer.rs index 8dcf503..114b3d7 100644 --- a/crates/jlabel/src/serializer.rs +++ b/crates/jlabel/src/serializer.rs @@ -35,6 +35,13 @@ impl<'a, 'b> Serializer<'a, 'b> { } } + fn d01_or_xx(&mut self, value: &Option) -> Result { + match value { + Some(v) => write!(self.f, "{:01}", v), + None => self.xx(), + } + } + fn d02_or_xx(&mut self, value: &Option) -> Result { match value { Some(v) => write!(self.f, "{:02}", v), @@ -95,9 +102,9 @@ impl<'a, 'b> Serializer<'a, 'b> { if let Some(word_prev) = word_prev { self.d02_or_xx(&word_prev.pos)?; self.f.write_char('-')?; - self.d02_or_xx(&word_prev.ctype)?; + self.d01_or_xx(&word_prev.ctype)?; self.f.write_char('_')?; - self.d02_or_xx(&word_prev.cform)?; + self.d01_or_xx(&word_prev.cform)?; } else { self.all_xx(&['-', '_'])?; } @@ -112,9 +119,9 @@ impl<'a, 'b> Serializer<'a, 'b> { if let Some(word_curr) = word_curr { self.d02_or_xx(&word_curr.pos)?; self.f.write_char('_')?; - self.d02_or_xx(&word_curr.ctype)?; + self.d01_or_xx(&word_curr.ctype)?; self.f.write_char('+')?; - self.d02_or_xx(&word_curr.cform)?; + self.d01_or_xx(&word_curr.cform)?; } else { self.all_xx(&['_', '+'])?; } @@ -129,9 +136,9 @@ impl<'a, 'b> Serializer<'a, 'b> { if let Some(word_next) = word_next { self.d02_or_xx(&word_next.pos)?; self.f.write_char('+')?; - self.d02_or_xx(&word_next.ctype)?; + 
self.d01_or_xx(&word_next.ctype)?; self.f.write_char('_')?; - self.d02_or_xx(&word_next.cform)?; + self.d01_or_xx(&word_next.cform)?; } else { self.all_xx(&['+', '_'])?; } @@ -152,7 +159,7 @@ impl<'a, 'b> Serializer<'a, 'b> { self.f.write_char('_')?; self.xx()?; self.f.write_char('-')?; - self.bool_or_xx(&accent_phrase_prev.is_pause_insertion)?; + self.bool_or_xx(&accent_phrase_prev.is_pause_insertion.map(|value| !value))?; } else { self.all_xx(&['_', '!', '_', '-'])?; } @@ -204,7 +211,7 @@ impl<'a, 'b> Serializer<'a, 'b> { self.f.write_char('_')?; self.xx()?; self.f.write_char('_')?; - self.bool_or_xx(&accent_phrase_next.is_pause_insertion)?; + self.bool_or_xx(&accent_phrase_next.is_pause_insertion.map(|value| !value))?; } else { self.all_xx(&['_', '%', '_', '_'])?; } diff --git a/crates/jlabel/tests/fixtures.rs b/crates/jlabel/tests/fixtures.rs index a6e921f..a43e68f 100644 --- a/crates/jlabel/tests/fixtures.rs +++ b/crates/jlabel/tests/fixtures.rs @@ -3,8 +3,9 @@ use jlabel::{ Mora, Phoneme, Utterance, Word, }; -pub fn fixtures() -> [(&'static str, Label); 11] { +pub fn fixtures() -> [(&'static str, Label); 12] { [ + // こんにちは ( "xx^xx-sil+k=o/A:xx+xx+xx/B:xx-xx_xx/C:xx_xx+xx/D:xx+xx_xx/E:xx_xx!xx_xx-xx/F:xx_xx#xx_xx@xx_xx|xx_xx/G:5_5%0_xx_xx/H:xx_xx/I:xx-xx@xx+xx&xx-xx|xx+xx/J:1_5/K:1+1-5", Label { @@ -543,5 +544,75 @@ pub fn fixtures() -> [(&'static str, Label); 11] { }, }, ), + // 「なにを言っているのですか,それはスマホですよ.」 + // (partial; 6th phoneme including the first sil) + ( + "n^i-o+i=cl/A:2+3+1/B:04-xx_xx/C:13_xx+xx/D:20+1_1/E:xx_xx!xx_xx-xx/F:3_1#0_xx@1_4|1_12/G:3_3%0_xx_1/H:xx_xx/I:4-12@1+2&1-6|1+21/J:2_9/K:2+6-21", + Label { + phoneme: Phoneme { + p2: Some("n".to_string()), + p1: Some("i".to_string()), + c: Some("o".to_string()), + n1: Some("i".to_string()), + n2: Some("cl".to_string()), + }, + mora: Some(Mora { + relative_accent_position: 2, + position_forward: 3, + position_backward: 1, + }), + word_prev: Some(Word { + pos: Some(4), + ctype: None, + cform: None, + 
}), + word_curr: Some(Word { + pos: Some(13), + ctype: None, + cform: None, + }), + word_next: Some(Word { + pos: Some(20), + ctype: Some(1), + cform: Some(1), + }), + accent_phrase_prev: None, + accent_phrase_curr: Some(AccentPhraseCurrent { + mora_count: 3, + accent_position: 1, + is_interrogative: false, + accent_phrase_position_forward: 1, + accent_phrase_position_backward: 4, + mora_position_forward: 1, + mora_position_backward: 12, + }), + accent_phrase_next: Some(AccentPhrasePrevNext { + mora_count: 3, + accent_position: 3, + is_interrogative: false, + is_pause_insertion: Some(false), + }), + breath_group_prev: None, + breath_group_curr: Some(BreathGroupCurrent { + accent_phrase_count: 4, + mora_count: 12, + breath_group_position_forward: 1, + breath_group_position_backward: 2, + accent_phrase_position_forward: 1, + accent_phrase_position_backward: 6, + mora_position_forward: 1, + mora_position_backward: 21, + }), + breath_group_next: Some(BreathGroupPrevNext { + accent_phrase_count: 2, + mora_count: 9, + }), + utterance: Utterance { + breath_group_count: 2, + accent_phrase_count: 6, + mora_count: 21, + }, + }, + ) ] }