From 8ce37970e3a20562c779ee2f246c42c99e97b7c6 Mon Sep 17 00:00:00 2001
From: phenylshima <49227365+femshima@users.noreply.github.com>
Date: Tue, 6 Feb 2024 20:07:41 +0900
Subject: [PATCH] Fix jlabel parser/serializer bugs (#25)
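* fix reversed logic of E5/G5 (`is_pause_insertion`) in parser and serializer
* remove colon
* fix word formatter (write ctype/cform with `{:01}` instead of `{:02}`)
* add new test fixture
* use warning class for the reversed-logic doc note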
---
crates/jlabel/src/fullcontext_label.rs | 7 +++
crates/jlabel/src/parser.rs | 4 +-
crates/jlabel/src/serializer.rs | 23 +++++---
crates/jlabel/tests/fixtures.rs | 73 +++++++++++++++++++++++++-
4 files changed, 96 insertions(+), 11 deletions(-)
diff --git a/crates/jlabel/src/fullcontext_label.rs b/crates/jlabel/src/fullcontext_label.rs
index 85e34fa..817a7e1 100644
--- a/crates/jlabel/src/fullcontext_label.rs
+++ b/crates/jlabel/src/fullcontext_label.rs
@@ -108,6 +108,13 @@ pub struct AccentPhrasePrevNext {
/// E3/G3: whether the accent phrase interrogative or not
pub is_interrogative: bool,
/// E5/G5: whether pause insertion or not in between the accent phrase and the current accent phrase
+ ///
+ /// <div class="warning">
+ ///
+ /// The logic of this field is reversed relative to E5/G5 of the full-context label:
+ /// a label value of "1" is stored as `false` and "0" is stored as `true`.
+ ///
+ /// </div>
pub is_pause_insertion: Option,
}
diff --git a/crates/jlabel/src/parser.rs b/crates/jlabel/src/parser.rs
index 2ce8f48..38dc32f 100644
--- a/crates/jlabel/src/parser.rs
+++ b/crates/jlabel/src/parser.rs
@@ -174,7 +174,7 @@ impl<'a> LabelTokenizer<'a> {
mora_count: e1,
accent_position: e2,
is_interrogative: e3,
- is_pause_insertion: e5,
+ is_pause_insertion: e5.map(|e5| !e5),
}))
} else {
Ok(None)
@@ -222,7 +222,7 @@ impl<'a> LabelTokenizer<'a> {
mora_count: g1,
accent_position: g2,
is_interrogative: g3,
- is_pause_insertion: g5,
+ is_pause_insertion: g5.map(|g5| !g5),
}))
} else {
Ok(None)
diff --git a/crates/jlabel/src/serializer.rs b/crates/jlabel/src/serializer.rs
index 8dcf503..114b3d7 100644
--- a/crates/jlabel/src/serializer.rs
+++ b/crates/jlabel/src/serializer.rs
@@ -35,6 +35,13 @@ impl<'a, 'b> Serializer<'a, 'b> {
}
}
+ fn d01_or_xx(&mut self, value: &Option) -> Result {
+ match value {
+ Some(v) => write!(self.f, "{:01}", v),
+ None => self.xx(),
+ }
+ }
+
fn d02_or_xx(&mut self, value: &Option) -> Result {
match value {
Some(v) => write!(self.f, "{:02}", v),
@@ -95,9 +102,9 @@ impl<'a, 'b> Serializer<'a, 'b> {
if let Some(word_prev) = word_prev {
self.d02_or_xx(&word_prev.pos)?;
self.f.write_char('-')?;
- self.d02_or_xx(&word_prev.ctype)?;
+ self.d01_or_xx(&word_prev.ctype)?;
self.f.write_char('_')?;
- self.d02_or_xx(&word_prev.cform)?;
+ self.d01_or_xx(&word_prev.cform)?;
} else {
self.all_xx(&['-', '_'])?;
}
@@ -112,9 +119,9 @@ impl<'a, 'b> Serializer<'a, 'b> {
if let Some(word_curr) = word_curr {
self.d02_or_xx(&word_curr.pos)?;
self.f.write_char('_')?;
- self.d02_or_xx(&word_curr.ctype)?;
+ self.d01_or_xx(&word_curr.ctype)?;
self.f.write_char('+')?;
- self.d02_or_xx(&word_curr.cform)?;
+ self.d01_or_xx(&word_curr.cform)?;
} else {
self.all_xx(&['_', '+'])?;
}
@@ -129,9 +136,9 @@ impl<'a, 'b> Serializer<'a, 'b> {
if let Some(word_next) = word_next {
self.d02_or_xx(&word_next.pos)?;
self.f.write_char('+')?;
- self.d02_or_xx(&word_next.ctype)?;
+ self.d01_or_xx(&word_next.ctype)?;
self.f.write_char('_')?;
- self.d02_or_xx(&word_next.cform)?;
+ self.d01_or_xx(&word_next.cform)?;
} else {
self.all_xx(&['+', '_'])?;
}
@@ -152,7 +159,7 @@ impl<'a, 'b> Serializer<'a, 'b> {
self.f.write_char('_')?;
self.xx()?;
self.f.write_char('-')?;
- self.bool_or_xx(&accent_phrase_prev.is_pause_insertion)?;
+ self.bool_or_xx(&accent_phrase_prev.is_pause_insertion.map(|value| !value))?;
} else {
self.all_xx(&['_', '!', '_', '-'])?;
}
@@ -204,7 +211,7 @@ impl<'a, 'b> Serializer<'a, 'b> {
self.f.write_char('_')?;
self.xx()?;
self.f.write_char('_')?;
- self.bool_or_xx(&accent_phrase_next.is_pause_insertion)?;
+ self.bool_or_xx(&accent_phrase_next.is_pause_insertion.map(|value| !value))?;
} else {
self.all_xx(&['_', '%', '_', '_'])?;
}
diff --git a/crates/jlabel/tests/fixtures.rs b/crates/jlabel/tests/fixtures.rs
index a6e921f..a43e68f 100644
--- a/crates/jlabel/tests/fixtures.rs
+++ b/crates/jlabel/tests/fixtures.rs
@@ -3,8 +3,9 @@ use jlabel::{
Mora, Phoneme, Utterance, Word,
};
-pub fn fixtures() -> [(&'static str, Label); 11] {
+pub fn fixtures() -> [(&'static str, Label); 12] {
[
+ // こんにちは
(
"xx^xx-sil+k=o/A:xx+xx+xx/B:xx-xx_xx/C:xx_xx+xx/D:xx+xx_xx/E:xx_xx!xx_xx-xx/F:xx_xx#xx_xx@xx_xx|xx_xx/G:5_5%0_xx_xx/H:xx_xx/I:xx-xx@xx+xx&xx-xx|xx+xx/J:1_5/K:1+1-5",
Label {
@@ -543,5 +544,75 @@ pub fn fixtures() -> [(&'static str, Label); 11] {
},
},
),
+ // 「なにを言っているのですか,それはスマホですよ.」
+ // (partial: only the label of the 6th phoneme, counting the leading sil as the 1st)
+ (
+ "n^i-o+i=cl/A:2+3+1/B:04-xx_xx/C:13_xx+xx/D:20+1_1/E:xx_xx!xx_xx-xx/F:3_1#0_xx@1_4|1_12/G:3_3%0_xx_1/H:xx_xx/I:4-12@1+2&1-6|1+21/J:2_9/K:2+6-21",
+ Label {
+ phoneme: Phoneme {
+ p2: Some("n".to_string()),
+ p1: Some("i".to_string()),
+ c: Some("o".to_string()),
+ n1: Some("i".to_string()),
+ n2: Some("cl".to_string()),
+ },
+ mora: Some(Mora {
+ relative_accent_position: 2,
+ position_forward: 3,
+ position_backward: 1,
+ }),
+ word_prev: Some(Word {
+ pos: Some(4),
+ ctype: None,
+ cform: None,
+ }),
+ word_curr: Some(Word {
+ pos: Some(13),
+ ctype: None,
+ cform: None,
+ }),
+ word_next: Some(Word {
+ pos: Some(20),
+ ctype: Some(1),
+ cform: Some(1),
+ }),
+ accent_phrase_prev: None,
+ accent_phrase_curr: Some(AccentPhraseCurrent {
+ mora_count: 3,
+ accent_position: 1,
+ is_interrogative: false,
+ accent_phrase_position_forward: 1,
+ accent_phrase_position_backward: 4,
+ mora_position_forward: 1,
+ mora_position_backward: 12,
+ }),
+ accent_phrase_next: Some(AccentPhrasePrevNext {
+ mora_count: 3,
+ accent_position: 3,
+ is_interrogative: false,
+ is_pause_insertion: Some(false),
+ }),
+ breath_group_prev: None,
+ breath_group_curr: Some(BreathGroupCurrent {
+ accent_phrase_count: 4,
+ mora_count: 12,
+ breath_group_position_forward: 1,
+ breath_group_position_backward: 2,
+ accent_phrase_position_forward: 1,
+ accent_phrase_position_backward: 6,
+ mora_position_forward: 1,
+ mora_position_backward: 21,
+ }),
+ breath_group_next: Some(BreathGroupPrevNext {
+ accent_phrase_count: 2,
+ mora_count: 9,
+ }),
+ utterance: Utterance {
+ breath_group_count: 2,
+ accent_phrase_count: 6,
+ mora_count: 21,
+ },
+ },
+ )
]
}