-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
Showing
18 changed files
with
2,791 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
.idea/ | ||
*.iml | ||
target/ | ||
src/test/ |
Binary file not shown.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,61 @@ | ||
<?xml version="1.0" encoding="UTF-8"?>
<!-- Maven build descriptor for the jfst library. -->
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>de.tuebingen.sfs</groupId>
    <artifactId>jfst</artifactId>
    <version>1.0-SNAPSHOT</version>

    <!-- JitPack serves the com.github.tdaneyko artifacts listed below. -->
    <repositories>
        <repository>
            <id>jitpack.io</id>
            <url>https://jitpack.io</url>
        </repository>
    </repositories>

    <dependencies>
        <!-- Own dependencies -->
        <dependency>
            <groupId>com.github.tdaneyko</groupId>
            <artifactId>string-utils</artifactId>
            <version>v0.9</version>
        </dependency>
        <dependency>
            <groupId>com.github.tdaneyko</groupId>
            <artifactId>bin-utils</artifactId>
            <version>v0.9</version>
        </dependency>
        <!-- Foreign dependencies -->
        <dependency>
            <groupId>net.sf.trove4j</groupId>
            <artifactId>trove4j</artifactId>
            <version>3.0.3</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <!--
            The resources element is only valid inside build in the POM 4.0.0 model;
            it was previously declared directly under project.
            NOTE(review): declaring resources here replaces Maven's default
            src/main/resources directory. The directory below is kept exactly as
            originally declared; confirm that src/test/resources is the intended
            main resource directory.
        -->
        <resources>
            <resource>
                <directory>src/test/resources</directory>
            </resource>
        </resources>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,198 @@ | ||
package de.tuebingen.sfs.jfst; | ||
|
||
import java.util.*; | ||
|
||
/** | ||
* The collection of symbols used by an FST. | ||
* | ||
* This alphabet stores a single object for each unique char or string that occurs | ||
* in an FST. The FST can retrieve references to these wrapper objects for its transitions | ||
* and reuse them wherever the symbol occurs to save space. (I.e. each unique string or | ||
* char is stored only once instead of each time when it occurs in a transition.) | ||
*/ | ||
public class Alphabet { | ||
|
||
// Symbols in alphabetic order | ||
private final List<Symbol> alphabet; | ||
// Symbols in the order of their ids | ||
private final List<Symbol> id2sym; | ||
|
||
/** | ||
* Create an empty Alphabet. | ||
*/ | ||
Alphabet() { | ||
alphabet = new ArrayList<>(); | ||
id2sym = new ArrayList<>(); | ||
} | ||
|
||
/** | ||
* Create an Alphabet with symbols. | ||
*/ | ||
Alphabet(String[] symbols) { | ||
this(); | ||
addSymbols(symbols); | ||
} | ||
|
||
/** | ||
* Create an Alphabet with symbols. | ||
*/ | ||
Alphabet(Iterable<String> symbols) { | ||
this(); | ||
addSymbols(symbols); | ||
} | ||
|
||
/** | ||
* Checks whether this Alphabet contains a symbol. | ||
* @param symbol A char symbol | ||
* @return True if this Alphabet already contains the symbol | ||
*/ | ||
boolean contains(char symbol) { | ||
return Collections.binarySearch(alphabet, symbol) >= 0; | ||
} | ||
|
||
/** | ||
* Checks whether this Alphabet contains a symbol. | ||
* @param symbol A string symbol | ||
* @return True if this Alphabet already contains the symbol | ||
*/ | ||
boolean contains(String symbol) { | ||
return Collections.binarySearch(alphabet, symbol) >= 0; | ||
} | ||
|
||
/** | ||
* Get the Symbol object associated with this string. If the Alphabet does not | ||
* contain the symbol yet, it will be added. | ||
* @param symbol A string symbol | ||
* @return The Symbol object associated with this string | ||
*/ | ||
Symbol getSymbol(String symbol) { | ||
int i = Collections.binarySearch(alphabet, symbol); | ||
if (i < 0) { | ||
i = -(i+1); | ||
return addSymbol(symbol, i); | ||
} | ||
else | ||
return alphabet.get(i); | ||
} | ||
|
||
/** | ||
* Get the symbol with this id. | ||
* @param id An id | ||
* @return The symbol with this id or null if there is no such symbol | ||
*/ | ||
Symbol getSymbol(int id) { | ||
if (id < id2sym.size()) | ||
return id2sym.get(id); | ||
else | ||
return null; | ||
} | ||
|
||
void setSymbol(int id, String symbol) { | ||
if (id >= id2sym.size()) | ||
addSymbol(symbol); | ||
else { | ||
Symbol sym = id2sym.get(id); | ||
int i = Collections.binarySearch(alphabet, sym); | ||
Symbol newSym = createSymbol(symbol, id); | ||
id2sym.set(id, newSym); | ||
alphabet.set(i, newSym); | ||
} | ||
} | ||
|
||
List<Symbol> getPrefixes(String s, int start) { | ||
List<Symbol> prefixes = new ArrayList<>(); | ||
char c = s.charAt(start); | ||
// int i = Collections.binarySearch(alphabet, c); | ||
// if (i < 0) | ||
// i = -i - 1; | ||
int i = 0; | ||
while (i < alphabet.size() && !alphabet.get(i).startsWith(c)) { | ||
i++; | ||
} | ||
for (int j = i; j < alphabet.size() && alphabet.get(j).startsWith(c); j++) { | ||
Symbol sym = alphabet.get(j); | ||
if (sym.prefixOf(s, start)) | ||
prefixes.add(sym); | ||
} | ||
return prefixes; | ||
} | ||
|
||
/** | ||
* Add a symbol to the alphabet. Calls getSymbol() internally. | ||
* @param symbol A symbol | ||
*/ | ||
void addSymbol(String symbol) { | ||
getSymbol(symbol); | ||
} | ||
|
||
/** | ||
* Add multiple symbols to the alphabet. Calls getSymbol() internally. | ||
* @param symbols An array of symbols | ||
*/ | ||
void addSymbols(String[] symbols) { | ||
int i = 0; | ||
for (String sym : symbols) { | ||
if (sym == null) | ||
addSymbol("NULL"+(i++)); | ||
else | ||
addSymbol(sym); | ||
} | ||
} | ||
|
||
/** | ||
* Add multiple symbols to the alphabet. Calls getSymbol() internally. | ||
* @param symbols A list of symbols | ||
*/ | ||
void addSymbols(Iterable<String> symbols) { | ||
for (String sym : symbols) { | ||
if (sym == null) | ||
addSymbol("NULL"); | ||
else | ||
addSymbol(sym); | ||
} | ||
} | ||
|
||
/** | ||
* Insert a symbol at index i. | ||
* @param symbol A symbol | ||
* @param i An index in alphabet | ||
* @return The created Symbol object | ||
*/ | ||
private Symbol addSymbol(String symbol, int i) { | ||
int id = id2sym.size(); | ||
Symbol s = createSymbol(symbol, id); | ||
alphabet.add(i, s); | ||
id2sym.add(s); | ||
return s; | ||
} | ||
|
||
private Symbol createSymbol(String symbol, int id) { | ||
return (symbol.length() == 1) ? new CharSymbol(symbol.charAt(0), id) : new MulticharSymbol(symbol, id); | ||
} | ||
|
||
/** | ||
* Get the string representations of all symbols in this alphabet as an array, ordered according to their id. | ||
* @return An array with all symols in this alphabet | ||
*/ | ||
String[] getSymbols() { | ||
return id2sym.stream().map(Symbol::asString).toArray(String[]::new); | ||
} | ||
|
||
/** | ||
* Find the id of that symbol. | ||
* @param symbol A string symbol | ||
* @return The id of the symbol in this alphabet, or -1 if it is not contained | ||
*/ | ||
int idOf(String symbol) { | ||
return (contains(symbol)) ? getSymbol(symbol).getId() : -1; | ||
} | ||
|
||
/** | ||
* Get the total number of symbols in this alphabet. | ||
* @return The size of this alphabet | ||
*/ | ||
int size() { | ||
return alphabet.size(); | ||
} | ||
|
||
} |
119 changes: 119 additions & 0 deletions
119
src/main/java/de/tuebingen/sfs/jfst/BinaryFSTWriter.java
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,119 @@ | ||
package de.tuebingen.sfs.jfst; | ||
|
||
import de.tuebingen.sfs.util.bin.IOUtils; | ||
import gnu.trove.list.TIntList; | ||
import gnu.trove.list.array.TIntArrayList; | ||
|
||
import java.io.IOException; | ||
import java.io.OutputStream; | ||
|
||
/** | ||
* A class for writing a JFST object to a binary file. | ||
*/ | ||
public class BinaryFSTWriter { | ||
|
||
/** | ||
* Control byte that marks the end of literal transitions of an accepting state | ||
*/ | ||
public static final byte ACCEPTING = (byte) 0b10000001; | ||
/** | ||
* Control byte that marks the end of literal transitions of a non-accepting state | ||
*/ | ||
public static final byte NONACCEPTING = (byte) 0b10000000; | ||
/** | ||
* Control byte that marks the end of a state | ||
*/ | ||
public static final byte STATEEND = (byte) 0b11111111; | ||
|
||
/** | ||
* Write an FST to a binary file. | ||
* @param out Output file | ||
* @param fst The FST | ||
* @throws IOException | ||
*/ | ||
public static void writeFST(OutputStream out, FST fst) throws IOException { | ||
writeFST(out, fst.iter(), fst.getSymbols()); | ||
} | ||
|
||
/** | ||
* Write an FST to a binary file. | ||
* @param out Output file | ||
* @param states Ierator over states and transitions of an FST | ||
* @param alphabet Symbols used by that FST | ||
* @throws IOException | ||
*/ | ||
public static void writeFST(OutputStream out, FSTStateIterator states, String[] alphabet) throws IOException { | ||
int startID = states.getStartState(); | ||
int nStates = states.nOfStates(); | ||
int nTrans = states.nOfTransitions(); | ||
int nSyms = alphabet.length; | ||
int a = IOUtils.bytesNeededFor(nSyms-1); // Symbol id size | ||
int s = IOUtils.bytesNeededFor(nStates-1); // State id size | ||
|
||
// Write alphabet to file | ||
for (String sym : alphabet) { | ||
IOUtils.writeAsBytes(sym, out); | ||
} | ||
// Write extra newline to mark end of alphabet | ||
IOUtils.writeNewline(out); | ||
|
||
// Write number of states | ||
IOUtils.writeInt(nStates, out); | ||
|
||
// Write start id | ||
IOUtils.writeIntTruncated(startID, s, out); | ||
|
||
// Write number of transitions | ||
IOUtils.writeInt(nTrans, out); | ||
|
||
// Write transitions | ||
while (states.hasNextState()) { | ||
states.nextState(); | ||
// Store identity transitions to write them later | ||
TIntList identityTransitions = new TIntArrayList(); | ||
|
||
while (states.hasNextTransition()) { | ||
states.nextTransition(); | ||
// Save identity transitions for later | ||
if (states.identity()) | ||
identityTransitions.add(states.toId()); | ||
// Write literal transition | ||
else { | ||
int toId = states.toId(); | ||
int inSym = states.inId(); | ||
int outSym = states.outId(); | ||
int k = s + a + a - 1; | ||
// Convert transition into byte array | ||
byte[] transBytes = new byte[k + 1]; | ||
while (k >= s + a) { | ||
transBytes[k] = (byte) outSym; | ||
outSym = outSym >> 8; | ||
k--; | ||
} | ||
while (k >= s) { | ||
transBytes[k] = (byte) inSym; | ||
inSym = inSym >> 8; | ||
k--; | ||
} | ||
while (k >= 0) { | ||
transBytes[k] = (byte) toId; | ||
toId = toId >> 8; | ||
k--; | ||
} | ||
out.write(transBytes); | ||
} | ||
} | ||
// Write accepting/non-accepting | ||
out.write((states.accepting()) ? ACCEPTING : NONACCEPTING); | ||
|
||
// Write identity transitions | ||
for (int i = 0; i < identityTransitions.size(); i++) { | ||
int toId = identityTransitions.get(i); | ||
IOUtils.writeIntTruncated(toId, s, out); | ||
} | ||
// Write end of state | ||
out.write(STATEEND); | ||
} | ||
} | ||
|
||
} |
Oops, something went wrong.