Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

implement parser for label #4

Merged
merged 3 commits into from
Jan 15, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
58 changes: 58 additions & 0 deletions Cargo.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -6,3 +6,4 @@ edition = "2021"
rust-version = "1.65.0"

[dependencies]
thiserror = "1.0.56"
1 change: 1 addition & 0 deletions src/lib.rs
Original file line number Diff line number Diff line change
@@ -1 +1,2 @@
mod fullcontext_label;
mod parser;
326 changes: 326 additions & 0 deletions src/parser/mod.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,326 @@
use std::{num::ParseIntError, str::FromStr};

use crate::fullcontext_label::{
AccentPhraseCurrent, AccentPhrasePrevNext, BreathGroupCurrent, BreathGroupPrevNext, Label,
Mora, Phoneme, Utterance, Word,
};

#[derive(Debug, thiserror::Error)]
pub enum ParseError {
#[error("Symbol not found: expected {0}")]
SymbolNotFound(&'static str),
#[error("Parse int error: {0}")]
ParseIntError(#[from] ParseIntError),
#[error("Parse bool error")]
ParseBoolError,
#[error("Not undefined")]
NotUndefined,
}

#[derive(Debug)]
struct LabelTokenizer<'a> {
input: &'a str,
index: usize,
}

impl<'a> LabelTokenizer<'a> {
fn new(input: &'a str) -> Self {
Self { input, index: 0 }
}

fn until(&mut self, symbol: &'static str) -> Result<&'a str, ParseError> {
match self.input[self.index..].find(symbol) {
Some(i) => {
let result = &self.input[self.index..(self.index + i)];
self.index += i + symbol.len();
Ok(result)
}
None => Err(ParseError::SymbolNotFound(symbol)),
}
}

fn string_or_xx(input: &'a str) -> Option<String> {
if input == "xx" {
None
} else {
Some(input.to_string())
}
}

fn parse_or_xx<T: FromStr>(input: &'a str) -> Result<Option<T>, T::Err> {
if input == "xx" {
Ok(None)
} else {
input.parse().map(Some)
}
}

fn parse_bool_or_xx(input: &'a str) -> Result<Option<bool>, ParseError> {
match input {
"xx" => Ok(None),
"0" => Ok(Some(false)),
"1" => Ok(Some(true)),
_ => Err(ParseError::ParseBoolError),
}
}

fn assert_xx(input: &'a str) -> Result<(), ParseError> {
if input == "xx" {
Ok(())
} else {
Err(ParseError::NotUndefined)
}
}

/// `p1ˆp2-p3+p4=p5`
fn p(&mut self) -> Result<Phoneme, ParseError> {
let p1 = Self::string_or_xx(self.until("^")?);
let p2 = Self::string_or_xx(self.until("-")?);
let p3 = Self::string_or_xx(self.until("+")?);
let p4 = Self::string_or_xx(self.until("=")?);
let p5 = Self::string_or_xx(self.until("/A:")?);
Ok(Phoneme {
p2: p1,
p1: p2,
c: p3,
n1: p4,
n2: p5,
})
}

/// `/A:a1+a2+a3`
fn a(&mut self) -> Result<Option<Mora>, ParseError> {
let a1 = Self::parse_or_xx(self.until("+")?)?;
let a2 = Self::parse_or_xx(self.until("+")?)?;
let a3 = Self::parse_or_xx(self.until("/B:")?)?;

if let (Some(a1), Some(a2), Some(a3)) = (a1, a2, a3) {
Ok(Some(Mora {
relative_accent_position: a1,
position_forward: a2,
position_backward: a3,
}))
} else {
Ok(None)
}
}

/// `/B:b1-b2_b3`
fn b(&mut self) -> Result<Option<Word>, ParseError> {
let b1 = Self::parse_or_xx(self.until("-")?)?;
let b2 = Self::parse_or_xx(self.until("_")?)?;
let b3 = Self::parse_or_xx(self.until("/C:")?)?;

if [b1, b2, b3].iter().all(Option::is_none) {
Ok(None)
} else {
Ok(Some(Word {
pos: b1,
ctype: b2,
cform: b3,
}))
}
}

/// `/C:c1_c2+c3`
fn c(&mut self) -> Result<Option<Word>, ParseError> {
let c1 = Self::parse_or_xx(self.until("_")?)?;
let c2 = Self::parse_or_xx(self.until("+")?)?;
let c3 = Self::parse_or_xx(self.until("/D:")?)?;

if [c1, c2, c3].iter().all(Option::is_none) {
Ok(None)
} else {
Ok(Some(Word {
pos: c1,
ctype: c2,
cform: c3,
}))
}
}

/// `/D:d1+d2_d3`
fn d(&mut self) -> Result<Option<Word>, ParseError> {
let d1 = Self::parse_or_xx(self.until("+")?)?;
let d2 = Self::parse_or_xx(self.until("_")?)?;
let d3 = Self::parse_or_xx(self.until("/E:")?)?;

if [d1, d2, d3].iter().all(Option::is_none) {
Ok(None)
} else {
Ok(Some(Word {
pos: d1,
ctype: d2,
cform: d3,
}))
}
}

/// `/E:e1_e2!e3_e4-e5`
fn e(&mut self) -> Result<Option<AccentPhrasePrevNext>, ParseError> {
let e1 = Self::parse_or_xx(self.until("_")?)?;
let e2 = Self::parse_or_xx(self.until("!")?)?;
let e3 = Self::parse_bool_or_xx(self.until("_")?)?;
Self::assert_xx(self.until("-")?)?;
let e5 = Self::parse_bool_or_xx(self.until("/F:")?)?;

if let (Some(e1), Some(e2), Some(e3)) = (e1, e2, e3) {
Ok(Some(AccentPhrasePrevNext {
mora_count: e1,
accent_position: e2,
is_interrogative: e3,
is_pause_insertion: e5,
}))
} else {
Ok(None)
}
}

/// `/F:f1_f2#_f3_f4@_f5_f6|f7_f8`
fn f(&mut self) -> Result<Option<AccentPhraseCurrent>, ParseError> {
let f1 = Self::parse_or_xx(self.until("_")?)?;
let f2 = Self::parse_or_xx(self.until("#")?)?;
let f3 = Self::parse_bool_or_xx(self.until("_")?)?;
Self::assert_xx(self.until("@")?)?;
let f5 = Self::parse_or_xx(self.until("_")?)?;
let f6 = Self::parse_or_xx(self.until("|")?)?;
let f7 = Self::parse_or_xx(self.until("_")?)?;
let f8 = Self::parse_or_xx(self.until("/G:")?)?;

if let (Some(f1), Some(f2), Some(f3), Some(f5), Some(f6), Some(f7), Some(f8)) =
(f1, f2, f3, f5, f6, f7, f8)
{
Ok(Some(AccentPhraseCurrent {
mora_count: f1,
accent_position: f2,
is_interrogative: f3,
accent_phrase_position_forward: f5,
accent_phrase_position_backward: f6,
mora_position_forward: f7,
mora_position_backward: f8,
}))
} else {
Ok(None)
}
}

/// `/G:g1_g2%g3_g4_g5`
fn g(&mut self) -> Result<Option<AccentPhrasePrevNext>, ParseError> {
let g1 = Self::parse_or_xx(self.until("_")?)?;
let g2 = Self::parse_or_xx(self.until("%")?)?;
let g3 = Self::parse_bool_or_xx(self.until("_")?)?;
Self::assert_xx(self.until("_")?)?;
let g5 = Self::parse_bool_or_xx(self.until("/H:")?)?;

if let (Some(g1), Some(g2), Some(g3)) = (g1, g2, g3) {
Ok(Some(AccentPhrasePrevNext {
mora_count: g1,
accent_position: g2,
is_interrogative: g3,
is_pause_insertion: g5,
}))
} else {
Ok(None)
}
}

/// `/H:h1_h2`
fn h(&mut self) -> Result<Option<BreathGroupPrevNext>, ParseError> {
let h1 = Self::parse_or_xx(self.until("_")?)?;
let h2 = Self::parse_or_xx(self.until("/I:")?)?;

if let (Some(h1), Some(h2)) = (h1, h2) {
Ok(Some(BreathGroupPrevNext {
accent_phrase_count: h1,
mora_count: h2,
}))
} else {
Ok(None)
}
}

/// `/I:i1-i2@i3+i4&i5-i6|i7+i8`
fn i(&mut self) -> Result<Option<BreathGroupCurrent>, ParseError> {
let i1 = Self::parse_or_xx(self.until("-")?)?;
let i2 = Self::parse_or_xx(self.until("@")?)?;
let i3 = Self::parse_or_xx(self.until("+")?)?;
let i4 = Self::parse_or_xx(self.until("&")?)?;
let i5 = Self::parse_or_xx(self.until("-")?)?;
let i6 = Self::parse_or_xx(self.until("|")?)?;
let i7 = Self::parse_or_xx(self.until("+")?)?;
let i8 = Self::parse_or_xx(self.until("/J:")?)?;

if let (Some(i1), Some(i2), Some(i3), Some(i4), Some(i5), Some(i6), Some(i7), Some(i8)) =
(i1, i2, i3, i4, i5, i6, i7, i8)
{
Ok(Some(BreathGroupCurrent {
accent_phrase_count: i1,
mora_count: i2,
breath_group_position_forward: i3,
breath_group_position_backward: i4,
accent_phrase_position_forward: i5,
accent_phrase_position_backward: i6,
mora_position_forward: i7,
mora_position_backward: i8,
}))
} else {
Ok(None)
}
}

/// `/J:j1_j2`
fn j(&mut self) -> Result<Option<BreathGroupPrevNext>, ParseError> {
let j1 = Self::parse_or_xx(self.until("_")?)?;
let j2 = Self::parse_or_xx(self.until("/K:")?)?;

if let (Some(j1), Some(j2)) = (j1, j2) {
Ok(Some(BreathGroupPrevNext {
accent_phrase_count: j1,
mora_count: j2,
}))
} else {
Ok(None)
}
}

/// `/K:k1+k2-k3`
fn k(&mut self) -> Result<Utterance, ParseError> {
let k1 = self.until("+")?.parse()?;
let k2 = self.until("-")?.parse()?;
let k3 = self.input[self.index..].parse()?;

Ok(Utterance {
breath_group_count: k1,
accent_phrase_count: k2,
mora_count: k3,
})
}

fn consume(mut self) -> Result<Label, ParseError> {
Ok(Label {
phoneme: self.p()?,
mora: self.a()?,
word_prev: self.b()?,
word_curr: self.c()?,
word_next: self.d()?,
accent_phrase_prev: self.e()?,
accent_phrase_curr: self.f()?,
accent_phrase_next: self.g()?,
breath_group_prev: self.h()?,
breath_group_curr: self.i()?,
breath_group_next: self.j()?,
utterance: self.k()?,
})
}
}

impl FromStr for Label {
type Err = ParseError;

fn from_str(s: &str) -> Result<Self, Self::Err> {
LabelTokenizer::new(s).consume()
}
}

#[cfg(test)]
mod test;
Loading
Loading