From 8ce37970e3a20562c779ee2f246c42c99e97b7c6 Mon Sep 17 00:00:00 2001
From: phenylshima <49227365+femshima@users.noreply.github.com>
Date: Tue, 6 Feb 2024 20:07:41 +0900
Subject: [PATCH] Fix jlabel parser/serializer bugs (#25)

* fix reversed logic of e5/g5

* remove colon

* fix word formatter

* add new test

* use warning class
---
 crates/jlabel/src/fullcontext_label.rs |  7 +++
 crates/jlabel/src/parser.rs            |  4 +-
 crates/jlabel/src/serializer.rs        | 23 +++++---
 crates/jlabel/tests/fixtures.rs        | 73 +++++++++++++++++++++++++-
 4 files changed, 96 insertions(+), 11 deletions(-)
diff --git a/crates/jlabel/src/fullcontext_label.rs b/crates/jlabel/src/fullcontext_label.rs
index 85e34fa..817a7e1 100644
--- a/crates/jlabel/src/fullcontext_label.rs
+++ b/crates/jlabel/src/fullcontext_label.rs
@@ -108,6 +108,13 @@ pub struct AccentPhrasePrevNext {
     /// E3/G3: whether the accent phrase interrogative or not
     pub is_interrogative: bool,
     /// E5/G5: whether pause insertion or not in between the accent phrase and the current accent phrase
+    ///
+    /// <div class="warning">
+    ///
+    /// This field is inverted relative to E5/G5 in the full-context label:
+    /// a label value of "1" is stored as `false`, and "0" is stored as `true`.
+    ///
+    /// </div>
     pub is_pause_insertion: Option<bool>,
 }
 
diff --git a/crates/jlabel/src/parser.rs b/crates/jlabel/src/parser.rs
index 2ce8f48..38dc32f 100644
--- a/crates/jlabel/src/parser.rs
+++ b/crates/jlabel/src/parser.rs
@@ -174,7 +174,7 @@ impl<'a> LabelTokenizer<'a> {
                 mora_count: e1,
                 accent_position: e2,
                 is_interrogative: e3,
-                is_pause_insertion: e5,
+                is_pause_insertion: e5.map(|e5| !e5),
             }))
         } else {
             Ok(None)
@@ -222,7 +222,7 @@ impl<'a> LabelTokenizer<'a> {
                 mora_count: g1,
                 accent_position: g2,
                 is_interrogative: g3,
-                is_pause_insertion: g5,
+                is_pause_insertion: g5.map(|g5| !g5),
             }))
         } else {
             Ok(None)
diff --git a/crates/jlabel/src/serializer.rs b/crates/jlabel/src/serializer.rs
index 8dcf503..114b3d7 100644
--- a/crates/jlabel/src/serializer.rs
+++ b/crates/jlabel/src/serializer.rs
@@ -35,6 +35,13 @@ impl<'a, 'b> Serializer<'a, 'b> {
         }
     }
 
+    fn d01_or_xx<T: Display>(&mut self, value: &Option<T>) -> Result {
+        if let Some(inner) = value {
+            return write!(self.f, "{:01}", inner); // present: minimum width 1, zero-padded
+        }
+        self.xx() // absent: delegate to the shared placeholder writer
+    }
+
     fn d02_or_xx<T: Display>(&mut self, value: &Option<T>) -> Result {
         match value {
             Some(v) => write!(self.f, "{:02}", v),
@@ -95,9 +102,9 @@ impl<'a, 'b> Serializer<'a, 'b> {
         if let Some(word_prev) = word_prev {
             self.d02_or_xx(&word_prev.pos)?;
             self.f.write_char('-')?;
-            self.d02_or_xx(&word_prev.ctype)?;
+            self.d01_or_xx(&word_prev.ctype)?;
             self.f.write_char('_')?;
-            self.d02_or_xx(&word_prev.cform)?;
+            self.d01_or_xx(&word_prev.cform)?;
         } else {
             self.all_xx(&['-', '_'])?;
         }
@@ -112,9 +119,9 @@ impl<'a, 'b> Serializer<'a, 'b> {
         if let Some(word_curr) = word_curr {
             self.d02_or_xx(&word_curr.pos)?;
             self.f.write_char('_')?;
-            self.d02_or_xx(&word_curr.ctype)?;
+            self.d01_or_xx(&word_curr.ctype)?;
             self.f.write_char('+')?;
-            self.d02_or_xx(&word_curr.cform)?;
+            self.d01_or_xx(&word_curr.cform)?;
         } else {
             self.all_xx(&['_', '+'])?;
         }
@@ -129,9 +136,9 @@ impl<'a, 'b> Serializer<'a, 'b> {
         if let Some(word_next) = word_next {
             self.d02_or_xx(&word_next.pos)?;
             self.f.write_char('+')?;
-            self.d02_or_xx(&word_next.ctype)?;
+            self.d01_or_xx(&word_next.ctype)?;
             self.f.write_char('_')?;
-            self.d02_or_xx(&word_next.cform)?;
+            self.d01_or_xx(&word_next.cform)?;
         } else {
             self.all_xx(&['+', '_'])?;
         }
@@ -152,7 +159,7 @@ impl<'a, 'b> Serializer<'a, 'b> {
             self.f.write_char('_')?;
             self.xx()?;
             self.f.write_char('-')?;
-            self.bool_or_xx(&accent_phrase_prev.is_pause_insertion)?;
+            self.bool_or_xx(&accent_phrase_prev.is_pause_insertion.map(|value| !value))?;
         } else {
             self.all_xx(&['_', '!', '_', '-'])?;
         }
@@ -204,7 +211,7 @@ impl<'a, 'b> Serializer<'a, 'b> {
             self.f.write_char('_')?;
             self.xx()?;
             self.f.write_char('_')?;
-            self.bool_or_xx(&accent_phrase_next.is_pause_insertion)?;
+            self.bool_or_xx(&accent_phrase_next.is_pause_insertion.map(|value| !value))?;
         } else {
             self.all_xx(&['_', '%', '_', '_'])?;
         }
diff --git a/crates/jlabel/tests/fixtures.rs b/crates/jlabel/tests/fixtures.rs
index a6e921f..a43e68f 100644
--- a/crates/jlabel/tests/fixtures.rs
+++ b/crates/jlabel/tests/fixtures.rs
@@ -3,8 +3,9 @@ use jlabel::{
     Mora, Phoneme, Utterance, Word,
 };
 
-pub fn fixtures() -> [(&'static str, Label); 11] {
+pub fn fixtures() -> [(&'static str, Label); 12] {
     [
+        // こんにちは
         (
             "xx^xx-sil+k=o/A:xx+xx+xx/B:xx-xx_xx/C:xx_xx+xx/D:xx+xx_xx/E:xx_xx!xx_xx-xx/F:xx_xx#xx_xx@xx_xx|xx_xx/G:5_5%0_xx_xx/H:xx_xx/I:xx-xx@xx+xx&xx-xx|xx+xx/J:1_5/K:1+1-5",
             Label {
@@ -543,5 +544,75 @@ pub fn fixtures() -> [(&'static str, Label); 11] {
                 },
             },
         ),
+        // 「なにを言っているのですか，それはスマホですよ．」
+        // (partial; 6th phoneme including the first sil)
+        (
+            "n^i-o+i=cl/A:2+3+1/B:04-xx_xx/C:13_xx+xx/D:20+1_1/E:xx_xx!xx_xx-xx/F:3_1#0_xx@1_4|1_12/G:3_3%0_xx_1/H:xx_xx/I:4-12@1+2&1-6|1+21/J:2_9/K:2+6-21",
+            Label {
+                phoneme: Phoneme {
+                    p2: Some("n".to_string()),
+                    p1: Some("i".to_string()),
+                    c: Some("o".to_string()),
+                    n1: Some("i".to_string()),
+                    n2: Some("cl".to_string()),
+                },
+                mora: Some(Mora {
+                    relative_accent_position: 2,
+                    position_forward: 3,
+                    position_backward: 1,
+                }),
+                word_prev: Some(Word {
+                    pos: Some(4),
+                    ctype: None,
+                    cform: None,
+                }),
+                word_curr: Some(Word {
+                    pos: Some(13),
+                    ctype: None,
+                    cform: None,
+                }),
+                word_next: Some(Word {
+                    pos: Some(20),
+                    ctype: Some(1),
+                    cform: Some(1),
+                }),
+                accent_phrase_prev: None,
+                accent_phrase_curr: Some(AccentPhraseCurrent {
+                    mora_count: 3,
+                    accent_position: 1,
+                    is_interrogative: false,
+                    accent_phrase_position_forward: 1,
+                    accent_phrase_position_backward: 4,
+                    mora_position_forward: 1,
+                    mora_position_backward: 12,
+                }),
+                accent_phrase_next: Some(AccentPhrasePrevNext {
+                    mora_count: 3,
+                    accent_position: 3,
+                    is_interrogative: false,
+                    is_pause_insertion: Some(false),
+                }),
+                breath_group_prev: None,
+                breath_group_curr: Some(BreathGroupCurrent {
+                    accent_phrase_count: 4,
+                    mora_count: 12,
+                    breath_group_position_forward: 1,
+                    breath_group_position_backward: 2,
+                    accent_phrase_position_forward: 1,
+                    accent_phrase_position_backward: 6,
+                    mora_position_forward: 1,
+                    mora_position_backward: 21,
+                }),
+                breath_group_next: Some(BreathGroupPrevNext {
+                    accent_phrase_count: 2,
+                    mora_count: 9,
+                }),
+                utterance: Utterance {
+                    breath_group_count: 2,
+                    accent_phrase_count: 6,
+                    mora_count: 21,
+                },
+            },
+        )
     ]
 }