-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #4 from jpreprocess/parser
implement parser for label
- Loading branch information
Showing
5 changed files
with
936 additions
and
0 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -6,3 +6,4 @@ edition = "2021" | |
rust-version = "1.65.0" | ||
|
||
[dependencies] | ||
thiserror = "1.0.56" |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1 +1,2 @@ | ||
mod fullcontext_label; | ||
mod parser; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,326 @@ | ||
use std::{num::ParseIntError, str::FromStr}; | ||
|
||
use crate::fullcontext_label::{ | ||
AccentPhraseCurrent, AccentPhrasePrevNext, BreathGroupCurrent, BreathGroupPrevNext, Label, | ||
Mora, Phoneme, Utterance, Word, | ||
}; | ||
|
||
#[derive(Debug, thiserror::Error)] | ||
pub enum ParseError { | ||
#[error("Symbol not found: expected {0}")] | ||
SymbolNotFound(&'static str), | ||
#[error("Parse int error: {0}")] | ||
ParseIntError(#[from] ParseIntError), | ||
#[error("Parse bool error")] | ||
ParseBoolError, | ||
#[error("Not undefined")] | ||
NotUndefined, | ||
} | ||
|
||
#[derive(Debug)] | ||
struct LabelTokenizer<'a> { | ||
input: &'a str, | ||
index: usize, | ||
} | ||
|
||
impl<'a> LabelTokenizer<'a> { | ||
fn new(input: &'a str) -> Self { | ||
Self { input, index: 0 } | ||
} | ||
|
||
fn until(&mut self, symbol: &'static str) -> Result<&'a str, ParseError> { | ||
match self.input[self.index..].find(symbol) { | ||
Some(i) => { | ||
let result = &self.input[self.index..(self.index + i)]; | ||
self.index += i + symbol.len(); | ||
Ok(result) | ||
} | ||
None => Err(ParseError::SymbolNotFound(symbol)), | ||
} | ||
} | ||
|
||
fn string_or_xx(input: &'a str) -> Option<String> { | ||
if input == "xx" { | ||
None | ||
} else { | ||
Some(input.to_string()) | ||
} | ||
} | ||
|
||
fn parse_or_xx<T: FromStr>(input: &'a str) -> Result<Option<T>, T::Err> { | ||
if input == "xx" { | ||
Ok(None) | ||
} else { | ||
input.parse().map(Some) | ||
} | ||
} | ||
|
||
fn parse_bool_or_xx(input: &'a str) -> Result<Option<bool>, ParseError> { | ||
match input { | ||
"xx" => Ok(None), | ||
"0" => Ok(Some(false)), | ||
"1" => Ok(Some(true)), | ||
_ => Err(ParseError::ParseBoolError), | ||
} | ||
} | ||
|
||
fn assert_xx(input: &'a str) -> Result<(), ParseError> { | ||
if input == "xx" { | ||
Ok(()) | ||
} else { | ||
Err(ParseError::NotUndefined) | ||
} | ||
} | ||
|
||
/// `p1ˆp2-p3+p4=p5` | ||
fn p(&mut self) -> Result<Phoneme, ParseError> { | ||
let p1 = Self::string_or_xx(self.until("^")?); | ||
let p2 = Self::string_or_xx(self.until("-")?); | ||
let p3 = Self::string_or_xx(self.until("+")?); | ||
let p4 = Self::string_or_xx(self.until("=")?); | ||
let p5 = Self::string_or_xx(self.until("/A:")?); | ||
Ok(Phoneme { | ||
p2: p1, | ||
p1: p2, | ||
c: p3, | ||
n1: p4, | ||
n2: p5, | ||
}) | ||
} | ||
|
||
/// `/A:a1+a2+a3` | ||
fn a(&mut self) -> Result<Option<Mora>, ParseError> { | ||
let a1 = Self::parse_or_xx(self.until("+")?)?; | ||
let a2 = Self::parse_or_xx(self.until("+")?)?; | ||
let a3 = Self::parse_or_xx(self.until("/B:")?)?; | ||
|
||
if let (Some(a1), Some(a2), Some(a3)) = (a1, a2, a3) { | ||
Ok(Some(Mora { | ||
relative_accent_position: a1, | ||
position_forward: a2, | ||
position_backward: a3, | ||
})) | ||
} else { | ||
Ok(None) | ||
} | ||
} | ||
|
||
/// `/B:b1-b2_b3` | ||
fn b(&mut self) -> Result<Option<Word>, ParseError> { | ||
let b1 = Self::parse_or_xx(self.until("-")?)?; | ||
let b2 = Self::parse_or_xx(self.until("_")?)?; | ||
let b3 = Self::parse_or_xx(self.until("/C:")?)?; | ||
|
||
if [b1, b2, b3].iter().all(Option::is_none) { | ||
Ok(None) | ||
} else { | ||
Ok(Some(Word { | ||
pos: b1, | ||
ctype: b2, | ||
cform: b3, | ||
})) | ||
} | ||
} | ||
|
||
/// `/C:c1_c2+c3` | ||
fn c(&mut self) -> Result<Option<Word>, ParseError> { | ||
let c1 = Self::parse_or_xx(self.until("_")?)?; | ||
let c2 = Self::parse_or_xx(self.until("+")?)?; | ||
let c3 = Self::parse_or_xx(self.until("/D:")?)?; | ||
|
||
if [c1, c2, c3].iter().all(Option::is_none) { | ||
Ok(None) | ||
} else { | ||
Ok(Some(Word { | ||
pos: c1, | ||
ctype: c2, | ||
cform: c3, | ||
})) | ||
} | ||
} | ||
|
||
/// `/D:d1+d2_d3` | ||
fn d(&mut self) -> Result<Option<Word>, ParseError> { | ||
let d1 = Self::parse_or_xx(self.until("+")?)?; | ||
let d2 = Self::parse_or_xx(self.until("_")?)?; | ||
let d3 = Self::parse_or_xx(self.until("/E:")?)?; | ||
|
||
if [d1, d2, d3].iter().all(Option::is_none) { | ||
Ok(None) | ||
} else { | ||
Ok(Some(Word { | ||
pos: d1, | ||
ctype: d2, | ||
cform: d3, | ||
})) | ||
} | ||
} | ||
|
||
/// `/E:e1_e2!e3_e4-e5` | ||
fn e(&mut self) -> Result<Option<AccentPhrasePrevNext>, ParseError> { | ||
let e1 = Self::parse_or_xx(self.until("_")?)?; | ||
let e2 = Self::parse_or_xx(self.until("!")?)?; | ||
let e3 = Self::parse_bool_or_xx(self.until("_")?)?; | ||
Self::assert_xx(self.until("-")?)?; | ||
let e5 = Self::parse_bool_or_xx(self.until("/F:")?)?; | ||
|
||
if let (Some(e1), Some(e2), Some(e3)) = (e1, e2, e3) { | ||
Ok(Some(AccentPhrasePrevNext { | ||
mora_count: e1, | ||
accent_position: e2, | ||
is_interrogative: e3, | ||
is_pause_insertion: e5, | ||
})) | ||
} else { | ||
Ok(None) | ||
} | ||
} | ||
|
||
/// `/F:f1_f2#_f3_f4@_f5_f6|f7_f8` | ||
fn f(&mut self) -> Result<Option<AccentPhraseCurrent>, ParseError> { | ||
let f1 = Self::parse_or_xx(self.until("_")?)?; | ||
let f2 = Self::parse_or_xx(self.until("#")?)?; | ||
let f3 = Self::parse_bool_or_xx(self.until("_")?)?; | ||
Self::assert_xx(self.until("@")?)?; | ||
let f5 = Self::parse_or_xx(self.until("_")?)?; | ||
let f6 = Self::parse_or_xx(self.until("|")?)?; | ||
let f7 = Self::parse_or_xx(self.until("_")?)?; | ||
let f8 = Self::parse_or_xx(self.until("/G:")?)?; | ||
|
||
if let (Some(f1), Some(f2), Some(f3), Some(f5), Some(f6), Some(f7), Some(f8)) = | ||
(f1, f2, f3, f5, f6, f7, f8) | ||
{ | ||
Ok(Some(AccentPhraseCurrent { | ||
mora_count: f1, | ||
accent_position: f2, | ||
is_interrogative: f3, | ||
accent_phrase_position_forward: f5, | ||
accent_phrase_position_backward: f6, | ||
mora_position_forward: f7, | ||
mora_position_backward: f8, | ||
})) | ||
} else { | ||
Ok(None) | ||
} | ||
} | ||
|
||
/// `/G:g1_g2%g3_g4_g5` | ||
fn g(&mut self) -> Result<Option<AccentPhrasePrevNext>, ParseError> { | ||
let g1 = Self::parse_or_xx(self.until("_")?)?; | ||
let g2 = Self::parse_or_xx(self.until("%")?)?; | ||
let g3 = Self::parse_bool_or_xx(self.until("_")?)?; | ||
Self::assert_xx(self.until("_")?)?; | ||
let g5 = Self::parse_bool_or_xx(self.until("/H:")?)?; | ||
|
||
if let (Some(g1), Some(g2), Some(g3)) = (g1, g2, g3) { | ||
Ok(Some(AccentPhrasePrevNext { | ||
mora_count: g1, | ||
accent_position: g2, | ||
is_interrogative: g3, | ||
is_pause_insertion: g5, | ||
})) | ||
} else { | ||
Ok(None) | ||
} | ||
} | ||
|
||
/// `/H:h1_h2` | ||
fn h(&mut self) -> Result<Option<BreathGroupPrevNext>, ParseError> { | ||
let h1 = Self::parse_or_xx(self.until("_")?)?; | ||
let h2 = Self::parse_or_xx(self.until("/I:")?)?; | ||
|
||
if let (Some(h1), Some(h2)) = (h1, h2) { | ||
Ok(Some(BreathGroupPrevNext { | ||
accent_phrase_count: h1, | ||
mora_count: h2, | ||
})) | ||
} else { | ||
Ok(None) | ||
} | ||
} | ||
|
||
/// `/I:i1-i2@i3+i4&i5-i6|i7+i8` | ||
fn i(&mut self) -> Result<Option<BreathGroupCurrent>, ParseError> { | ||
let i1 = Self::parse_or_xx(self.until("-")?)?; | ||
let i2 = Self::parse_or_xx(self.until("@")?)?; | ||
let i3 = Self::parse_or_xx(self.until("+")?)?; | ||
let i4 = Self::parse_or_xx(self.until("&")?)?; | ||
let i5 = Self::parse_or_xx(self.until("-")?)?; | ||
let i6 = Self::parse_or_xx(self.until("|")?)?; | ||
let i7 = Self::parse_or_xx(self.until("+")?)?; | ||
let i8 = Self::parse_or_xx(self.until("/J:")?)?; | ||
|
||
if let (Some(i1), Some(i2), Some(i3), Some(i4), Some(i5), Some(i6), Some(i7), Some(i8)) = | ||
(i1, i2, i3, i4, i5, i6, i7, i8) | ||
{ | ||
Ok(Some(BreathGroupCurrent { | ||
accent_phrase_count: i1, | ||
mora_count: i2, | ||
breath_group_position_forward: i3, | ||
breath_group_position_backward: i4, | ||
accent_phrase_position_forward: i5, | ||
accent_phrase_position_backward: i6, | ||
mora_position_forward: i7, | ||
mora_position_backward: i8, | ||
})) | ||
} else { | ||
Ok(None) | ||
} | ||
} | ||
|
||
/// `/J:j1_j2` | ||
fn j(&mut self) -> Result<Option<BreathGroupPrevNext>, ParseError> { | ||
let j1 = Self::parse_or_xx(self.until("_")?)?; | ||
let j2 = Self::parse_or_xx(self.until("/K:")?)?; | ||
|
||
if let (Some(j1), Some(j2)) = (j1, j2) { | ||
Ok(Some(BreathGroupPrevNext { | ||
accent_phrase_count: j1, | ||
mora_count: j2, | ||
})) | ||
} else { | ||
Ok(None) | ||
} | ||
} | ||
|
||
/// `/K:k1+k2-k3` | ||
fn k(&mut self) -> Result<Utterance, ParseError> { | ||
let k1 = self.until("+")?.parse()?; | ||
let k2 = self.until("-")?.parse()?; | ||
let k3 = self.input[self.index..].parse()?; | ||
|
||
Ok(Utterance { | ||
breath_group_count: k1, | ||
accent_phrase_count: k2, | ||
mora_count: k3, | ||
}) | ||
} | ||
|
||
fn consume(mut self) -> Result<Label, ParseError> { | ||
Ok(Label { | ||
phoneme: self.p()?, | ||
mora: self.a()?, | ||
word_prev: self.b()?, | ||
word_curr: self.c()?, | ||
word_next: self.d()?, | ||
accent_phrase_prev: self.e()?, | ||
accent_phrase_curr: self.f()?, | ||
accent_phrase_next: self.g()?, | ||
breath_group_prev: self.h()?, | ||
breath_group_curr: self.i()?, | ||
breath_group_next: self.j()?, | ||
utterance: self.k()?, | ||
}) | ||
} | ||
} | ||
|
||
impl FromStr for Label { | ||
type Err = ParseError; | ||
|
||
fn from_str(s: &str) -> Result<Self, Self::Err> { | ||
LabelTokenizer::new(s).consume() | ||
} | ||
} | ||
|
||
#[cfg(test)] | ||
mod test; |
Oops, something went wrong.