Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add abbreviation support #18

Merged
merged 6 commits into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ Lower Sorbian language component for MaryTTS
[Unreleased]
------------

### Added

- Abbreviation expansion in preprocessing

### Changed

- Upgraded lexicon to v0.2.0
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -46,4 +46,18 @@ class PreprocessIT {
}
assert actual == expected
}

@Test
void 'Given input with abbreviations, When text is converted to words, Then abbreviations are expanded correctly'() {
    // Unit abbreviations that each have an entry in abbreviations.csv, followed by a full stop.
    def text = 'GHz l/km mpg cm³.'
    def wanted = 'gigahertzow litrow na kilometer milow na galonu kubiknych centimetrow.'

    // Run the text through the pipeline and re-parse the serialized XML so tokens can be collected.
    def xml = mary.generateXML(text).documentElement.serialize()
    def tokenNodes = new XmlSlurper(false, false).parseText(xml).depthFirst().findAll { it.name() == 't' }

    // Join the tokens with single spaces; a lone punctuation token is attached to the preceding text.
    def joined = ''
    for (tokenNode in tokenNodes) {
        if (joined.isEmpty() || tokenNode ==~ /\p{Punct}/) {
            joined = joined + tokenNode
        } else {
            joined = joined + ' ' + tokenNode
        }
    }

    assert joined == wanted
}
}
44 changes: 43 additions & 1 deletion src/main/java/marytts/language/dsb/Preprocess.java
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
public class Preprocess extends InternalModule {

static final ULocale locale = new ULocale.Builder().setLanguage("dsb").build();
private Map<String, String> abbreviations;
private Map<String, String> symbols;
private RuleBasedNumberFormat ruleBasedNumberFormat;
private NumberFormat numberFormat;
Expand All @@ -39,6 +40,26 @@ public Preprocess() throws MaryConfigurationException {
super("Preprocess", MaryDataType.TOKENS, MaryDataType.WORDS, locale.toLocale());
initNumberExpansion("formatRules.txt");
initSymbolExpansion("symbols.csv");
initAbbreviationExpansion("abbreviations.csv");
}

/**
 * Loads the abbreviation table from a CSV resource into {@code abbreviations}.
 *
 * <p>The resource is looked up relative to this class and decoded as UTF-8. Columns are addressed
 * as {@code abbreviation} and {@code expansion}. The first record of the file is treated as the
 * column header line and skipped, so the header itself never becomes a lookup entry.
 *
 * @param resourceName name of the CSV resource, as accepted by {@link Class#getResourceAsStream(String)}
 * @throws MaryConfigurationException if the resource is missing, or cannot be read or parsed
 */
private void initAbbreviationExpansion(String resourceName) throws MaryConfigurationException {
    Map<String, String> loaded = new HashMap<>();
    try (InputStream abbreviationsStream = this.getClass().getResourceAsStream(resourceName)) {
        if (abbreviationsStream == null) {
            // getResourceAsStream reports a missing resource by returning null, not by throwing.
            throw new IllegalArgumentException("Resource not found: " + resourceName);
        }
        // The parser owns the reader; closing the parser releases it.
        try (CSVParser csv = CSVFormat.Builder.create(CSVFormat.DEFAULT)
                .setHeader("abbreviation", "expansion")
                .setSkipHeaderRecord(true)
                .build()
                .parse(new InputStreamReader(abbreviationsStream, Charsets.UTF_8))) {
            for (CSVRecord record : csv) {
                loaded.put(record.get("abbreviation"), record.get("expansion"));
            }
        }
    } catch (Exception exception) {
        throw new MaryConfigurationException(String.format("Could not load abbreviations from %s.%s", this.getClass().getCanonicalName(), resourceName), exception);
    }
    // Publish the table only after it has been read completely.
    abbreviations = loaded;
}

private void initSymbolExpansion(String resourceName) throws MaryConfigurationException {
Expand Down Expand Up @@ -73,13 +94,34 @@ private void initNumberExpansion(String resourceName) throws MaryConfigurationEx

/**
 * Runs abbreviation, symbol and number expansion over the document of the given input.
 *
 * <p>The three expansion passes are applied in that order to the document obtained from
 * {@code d}. The same document instance is then attached to a new {@link MaryData} created with
 * this module's output type and the locale of {@code d}.
 *
 * @param d input data whose document is to be preprocessed
 * @return new data object carrying the processed document
 */
public MaryData process(MaryData d) {
    final Document document = d.getDocument();

    expandAllAbbreviations(document);
    expandAllSymbols(document);
    expandAllNumbers(document);

    final MaryData output = new MaryData(getOutputType(), d.getLocale());
    output.setDocument(document);
    return output;
}

/**
 * Replaces the text of every token element whose text is a known abbreviation.
 *
 * <p>Walks all elements named {@code MaryXML.TOKEN} in the document, looks up each token's text
 * with {@link #expandAbbreviation(String)}, and rewrites the token only when the lookup yields a
 * different string.
 *
 * @param document document whose token elements are updated
 */
private void expandAllAbbreviations(Document document) {
    TreeWalker treeWalker = ((DocumentTraversal) document).createTreeWalker(document, NodeFilter.SHOW_ELEMENT,
            new NameNodeFilter(MaryXML.TOKEN), false);
    Element token;
    while ((token = (Element) treeWalker.nextNode()) != null) {
        String tokenText = MaryDomUtils.tokenText(token);
        String expandedAbbreviation = expandAbbreviation(tokenText);
        // Compare by value, not by reference: identity comparison of strings only worked as long
        // as the lookup returned the very same instance for unknown tokens.
        if (expandedAbbreviation != null && !expandedAbbreviation.equals(tokenText)) {
            MaryDomUtils.setTokenText(token, expandedAbbreviation);
        }
    }
}

/**
 * Looks up the expansion for an abbreviation.
 *
 * <p>The lookup is an exact, case-sensitive match against the keys of the abbreviation table.
 *
 * @param abbreviation text to look up
 * @return the mapped expansion, or {@code abbreviation} itself when the table has no such key
 */
protected String expandAbbreviation(String abbreviation) {
    // Unknown text passes through untouched.
    if (!abbreviations.containsKey(abbreviation)) {
        return abbreviation;
    }
    return abbreviations.get(abbreviation);
}

private void expandAllSymbols(Document document) {
TreeWalker treeWalker = ((DocumentTraversal) document).createTreeWalker(document, NodeFilter.SHOW_ELEMENT,
new NameNodeFilter(MaryXML.TOKEN), false);
Expand Down Expand Up @@ -126,4 +168,4 @@ protected String spelloutNumber(Number number) {
return ruleBasedNumberFormat.format(number);
}

}
}
129 changes: 129 additions & 0 deletions src/main/resources/marytts/language/dsb/abbreviations.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
"abbreviation","expansion"
"kwart.","kwartal"
"kw.","kwartal"
"mjas.","mjasac"
"tyź.","tyźeń"
"góź.","góźina"
"min.","minutow"
"sek.","sekundow"
"pś.Kr.n.","pśed Kristusowym naroźenim"
"pś.n.l.c.","pśed našym licenim casa"
"pó Kr.n.","pó Kristusowym naroźenim"
"n.l.c.","našogo licenja casa"
"jan.","januar"
"feb.","februar"
"měr.","měrc"
"apr.","apryl"
"jun.","junij"
"jul.","julij"
"awg.","awgust"
"sep.","september"
"okt.","oktober"
"now.","nowember"
"dec.","december"
"dop.","dopołdnja"
"połd.","połdnjo"
"wótp.","wótpołdnja"
"nje.","njeźela"
"pón.","pónjeźele"
"wał.","wałtora"
"srj.","srjoda"
"stw.","stwórtk"
"pě.","pětk"
"sob.","sobota"
"nj.","njeźela"
"pó.","pónjeźele"
"sr.","wu."
"st.","stwórtk"
"tys.","tysac"
"mil.","milionow"
"mrd.","miliardow"
"bil.","bilionow"
"mio.","milionow"
"PLN","pólskich złotych"
"zł","pólskich złotych"
"Kč","českich kronow"
"EUR","eurow"
"DKK","dańskich kronow"
"DM","markow"
"CHF","šwicaŕskich frankow"
"CZK","českich krónow"
"HUF","hungorskich forintow"
"PLZ","pej el cet"
"RUB","rusojskich rublow"
"CNY","chinskich yuanow"
"CN¥","chinskich yuanow"
"JPY","japańskich yenow"
"AUD","awstralskich dolarow"
"NZ$","nowoseelandskich dolarow"
"dn.","dnjow"
"dn.","dnjow"
"cm","centimetrow"
"dm","decimetrow"
"ft","stopow"
"km","kilometrow"
"m","metrow"
"μm","mikrometrow"
"mm","milimetrow"
"nm","nanometrow"
"nmi","nawtiskich milow"
"cm²"," kwadratnych centimetrow"
"ft²","kwadratnych stopow"
"in²","kwadratnych cólow"
"km²","kwadratnych kilometrow"
"m²","kwadratnych metrow"
"mi²","kwadratnych milow"
"yd²","kwadratnych yardow"
"cm³","kubiknych centimetrow"
"ft³","kubiknych stopow"
"in³","kubiknych cólow"
"km³","kubiknych kilometrow"
"m³","kubiknych metrow"
"mi³","kubiknych milow"
"yd³","kubiknych yardow"
"fl. oz.","žydkich uncow"
"łž.","łžycow"
"łžk.","łžyckow"
"m/s²","metrow na kwadratnu sekundu"
"km/h","kilometrow na góźinu"
"m/s","metrow na sekundu"
"mph","milow na góźinu"
"kg","kilogramow"
"µg","mikrogramow"
"mg","miligramow"
"oz","uncow"
"oz. tr.","trojskich uncow"
"lb","puntow"
"cal","kalorijow"
"kcal","kilokalorijow"
"kJ","kilodžulow"
"kWh","kilowattowych góźin"
"GW","gigawattow"
"PS","kónjecych mócow"
"kW","kilowattow"
"MW","megawattow"
"mW","miliwattow"
"mA","milliamperow"
"Ω","ohmow"
"GHz","gigahertzow"
"Hz","hertzow"
"kHz","kilohertzow"
"MHz","megahertzow"
"hPa","hektopascalow"
"inHg","cólow žywoslobrowego stołpika"
"mbar","milibarow"
"mm Hg","milimetrow žywoslobrowego stołpika"
"°C","stopnjow Celsiusa"
"°F","stopnjow Fahrenheita"
"bit","bitow"
"byte","byteow"
"Gb","gigabitow"
"GB","gigabyteow"
"kb","kilobitow"
"kB","kilobyteow"
"Mb","megabitow"
"MB","megabyteow"
"Tb","terabitow"
"TB","terabyteow"
"l/km","litrow na kilometer"
"mpg","milow na galonu"
5 changes: 0 additions & 5 deletions src/main/resources/marytts/language/dsb/symbols.csv
Original file line number Diff line number Diff line change
Expand Up @@ -77,13 +77,8 @@ $,dolarow
=,rowna se
>,wětše ako
@,et
[,wótwórjona rožkata spinka
\,beksleš
],zawrjena rožkata spinka
^,wušej
{,wótwórjona wuzgibnjona spinka
|,padorowna smužka
},zawrjena wuzgibnjona spinka
~,tilda
°,stopnjow
"°C",stopnjow Celsius
Expand Down
Loading