Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
tdaneyko committed Dec 13, 2019
1 parent c000fa1 commit 8b69565
Show file tree
Hide file tree
Showing 18 changed files with 2,791 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.idea/
*.iml
target/
src/test/
Binary file added docs/jfst_binary.pdf
Binary file not shown.
61 changes: 61 additions & 0 deletions pom.xml
Original file line number Diff line number Diff line change
@@ -0,0 +1,61 @@
<?xml version="1.0" encoding="UTF-8"?>
<project xmlns="http://maven.apache.org/POM/4.0.0"
         xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
         xsi:schemaLocation="http://maven.apache.org/POM/4.0.0 http://maven.apache.org/xsd/maven-4.0.0.xsd">
    <modelVersion>4.0.0</modelVersion>

    <groupId>de.tuebingen.sfs</groupId>
    <artifactId>jfst</artifactId>
    <version>1.0-SNAPSHOT</version>

    <repositories>
        <!-- JitPack serves the com.github.tdaneyko artifacts listed below -->
        <repository>
            <id>jitpack.io</id>
            <url>https://jitpack.io</url>
        </repository>
    </repositories>

    <dependencies>
        <!-- Own dependencies -->
        <dependency>
            <groupId>com.github.tdaneyko</groupId>
            <artifactId>string-utils</artifactId>
            <version>v0.9</version>
        </dependency>
        <dependency>
            <groupId>com.github.tdaneyko</groupId>
            <artifactId>bin-utils</artifactId>
            <version>v0.9</version>
        </dependency>
        <!-- Foreign dependencies -->
        <dependency>
            <groupId>net.sf.trove4j</groupId>
            <artifactId>trove4j</artifactId>
            <version>3.0.3</version>
        </dependency>
        <dependency>
            <groupId>junit</groupId>
            <artifactId>junit</artifactId>
            <version>4.11</version>
            <scope>test</scope>
        </dependency>
    </dependencies>

    <build>
        <!-- The POM model only allows <resources> as a child of <build>;
             it is not a valid direct child of <project>. -->
        <resources>
            <resource>
                <directory>src/test/resources</directory>
            </resource>
        </resources>
        <plugins>
            <plugin>
                <groupId>org.apache.maven.plugins</groupId>
                <artifactId>maven-compiler-plugin</artifactId>
                <configuration>
                    <source>1.8</source>
                    <target>1.8</target>
                </configuration>
            </plugin>
        </plugins>
    </build>
</project>
198 changes: 198 additions & 0 deletions src/main/java/de/tuebingen/sfs/jfst/Alphabet.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,198 @@
package de.tuebingen.sfs.jfst;

import java.util.*;

/**
 * The collection of symbols used by an FST.
 *
 * This alphabet stores a single object for each unique char or string that occurs
 * in an FST. The FST can retrieve references to these wrapper objects for its transitions
 * and reuse them wherever the symbol occurs to save space. (I.e. each unique string or
 * char is stored only once instead of each time when it occurs in a transition.)
 *
 * Two views of the same Symbol objects are maintained: {@code alphabet} is kept sorted
 * so that it can be binary-searched, and {@code id2sym} is indexed by symbol id.
 */
public class Alphabet {

    // Symbols in alphabetic order; must stay sorted because every lookup binary-searches it
    private final List<Symbol> alphabet;
    // Symbols in the order of their ids
    private final List<Symbol> id2sym;

    /**
     * Create an empty Alphabet.
     */
    Alphabet() {
        alphabet = new ArrayList<>();
        id2sym = new ArrayList<>();
    }

    /**
     * Create an Alphabet with symbols.
     * @param symbols The initial symbols; see {@link #addSymbols(String[])}
     */
    Alphabet(String[] symbols) {
        this();
        addSymbols(symbols);
    }

    /**
     * Create an Alphabet with symbols.
     * @param symbols The initial symbols; see {@link #addSymbols(Iterable)}
     */
    Alphabet(Iterable<String> symbols) {
        this();
        addSymbols(symbols);
    }

    /**
     * Checks whether this Alphabet contains a symbol.
     * @param symbol A char symbol
     * @return True if this Alphabet already contains the symbol
     */
    boolean contains(char symbol) {
        return Collections.binarySearch(alphabet, symbol) >= 0;
    }

    /**
     * Checks whether this Alphabet contains a symbol.
     * @param symbol A string symbol
     * @return True if this Alphabet already contains the symbol
     */
    boolean contains(String symbol) {
        return Collections.binarySearch(alphabet, symbol) >= 0;
    }

    /**
     * Get the Symbol object associated with this string. If the Alphabet does not
     * contain the symbol yet, it will be added.
     * @param symbol A string symbol
     * @return The Symbol object associated with this string
     */
    Symbol getSymbol(String symbol) {
        int i = Collections.binarySearch(alphabet, symbol);
        if (i >= 0) {
            return alphabet.get(i);
        }
        // A negative result encodes the insertion point as -(point + 1)
        return addSymbol(symbol, -(i + 1));
    }

    /**
     * Get the symbol with this id.
     * @param id An id
     * @return The symbol with this id or null if there is no such symbol
     *         (including negative ids)
     */
    Symbol getSymbol(int id) {
        if (id < 0 || id >= id2sym.size()) {
            return null;
        }
        return id2sym.get(id);
    }

    /**
     * Replace the symbol stored under an id by a new symbol with the same id.
     * If the id is not smaller than the current number of ids, the symbol is
     * instead added via {@link #addSymbol(String)}, i.e. it receives the next
     * free id (and nothing is added if the string is already contained).
     * The sorted view is updated by removing the old symbol and inserting the
     * replacement at its own sorted position, so lookups keep working.
     * @param id The id whose symbol should be replaced
     * @param symbol The new string symbol
     */
    void setSymbol(int id, String symbol) {
        if (id >= id2sym.size()) {
            addSymbol(symbol);
            return;
        }
        Symbol previous = id2sym.get(id);
        Symbol replacement = createSymbol(symbol, id);
        id2sym.set(id, replacement);

        // Overwriting the old slot in place would leave the list unsorted
        // whenever the new string sorts differently from the old one.
        removeFromSorted(previous);
        int pos = Collections.binarySearch(alphabet, symbol);
        if (pos < 0) {
            pos = -(pos + 1);
        }
        alphabet.add(pos, replacement);
    }

    /**
     * Remove exactly this Symbol object from the sorted view, if present.
     * @param sym The Symbol object to remove
     */
    private void removeFromSorted(Symbol sym) {
        int pos = Collections.binarySearch(alphabet, sym);
        if (pos < 0 || alphabet.get(pos) != sym) {
            // Binary search may land on a different object that compares equal;
            // fall back to an identity scan to find the right one.
            pos = -1;
            for (int j = 0; j < alphabet.size(); j++) {
                if (alphabet.get(j) == sym) {
                    pos = j;
                    break;
                }
            }
        }
        if (pos >= 0) {
            alphabet.remove(pos);
        }
    }

    /**
     * Collect all symbols of this alphabet that are a prefix of s at position start.
     * @param s The string to match against
     * @param start The index in s at which the prefixes must begin
     * @return The matching symbols in alphabetic order; empty if there are none
     *         or if start is at or beyond the end of s
     */
    List<Symbol> getPrefixes(String s, int start) {
        List<Symbol> prefixes = new ArrayList<>();
        if (start >= s.length()) {
            // Nothing can match at or beyond the end of the string
            return prefixes;
        }
        char c = s.charAt(start);
        // Linear scan to the first symbol starting with c.
        // NOTE(review): a binary search for this position was commented out in an
        // earlier revision; the scan is kept until its correctness is confirmed.
        int i = 0;
        while (i < alphabet.size() && !alphabet.get(i).startsWith(c)) {
            i++;
        }
        // Candidates are taken from the contiguous run of symbols starting with c
        for (int j = i; j < alphabet.size() && alphabet.get(j).startsWith(c); j++) {
            Symbol sym = alphabet.get(j);
            if (sym.prefixOf(s, start)) {
                prefixes.add(sym);
            }
        }
        return prefixes;
    }

    /**
     * Add a symbol to the alphabet. Calls getSymbol() internally.
     * @param symbol A symbol
     */
    void addSymbol(String symbol) {
        getSymbol(symbol);
    }

    /**
     * Add multiple symbols to the alphabet. Calls getSymbol() internally.
     * Null entries are replaced by the placeholders "NULL0", "NULL1", ...
     * @param symbols An array of symbols
     */
    void addSymbols(String[] symbols) {
        addSymbols(Arrays.asList(symbols));
    }

    /**
     * Add multiple symbols to the alphabet. Calls getSymbol() internally.
     * Null entries are replaced by the placeholders "NULL0", "NULL1", ...;
     * each null gets its own placeholder so that it still occupies an id.
     * @param symbols A list of symbols
     */
    void addSymbols(Iterable<String> symbols) {
        int nulls = 0;
        for (String sym : symbols) {
            addSymbol(sym == null ? "NULL" + (nulls++) : sym);
        }
    }

    /**
     * Insert a symbol at index i.
     * @param symbol A symbol
     * @param i An index in alphabet
     * @return The created Symbol object
     */
    private Symbol addSymbol(String symbol, int i) {
        // The new symbol gets the next free id
        int id = id2sym.size();
        Symbol s = createSymbol(symbol, id);
        alphabet.add(i, s);
        id2sym.add(s);
        return s;
    }

    /**
     * Create the Symbol object for a string: a CharSymbol for strings of
     * length 1, a MulticharSymbol otherwise.
     * @param symbol A string symbol
     * @param id The id to assign to the symbol
     * @return The created Symbol object
     */
    private Symbol createSymbol(String symbol, int id) {
        return (symbol.length() == 1) ? new CharSymbol(symbol.charAt(0), id) : new MulticharSymbol(symbol, id);
    }

    /**
     * Get the string representations of all symbols in this alphabet as an array, ordered according to their id.
     * @return An array with all symbols in this alphabet
     */
    String[] getSymbols() {
        return id2sym.stream().map(Symbol::asString).toArray(String[]::new);
    }

    /**
     * Find the id of that symbol.
     * @param symbol A string symbol
     * @return The id of the symbol in this alphabet, or -1 if it is not contained
     */
    int idOf(String symbol) {
        // One search serves both the containment check and the lookup
        int i = Collections.binarySearch(alphabet, symbol);
        return (i >= 0) ? alphabet.get(i).getId() : -1;
    }

    /**
     * Get the total number of symbols in this alphabet.
     * @return The size of this alphabet
     */
    int size() {
        return alphabet.size();
    }

}
119 changes: 119 additions & 0 deletions src/main/java/de/tuebingen/sfs/jfst/BinaryFSTWriter.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,119 @@
package de.tuebingen.sfs.jfst;

import de.tuebingen.sfs.util.bin.IOUtils;
import gnu.trove.list.TIntList;
import gnu.trove.list.array.TIntArrayList;

import java.io.IOException;
import java.io.OutputStream;

/**
 * Serializes a JFST transducer to its binary file representation.
 */
public class BinaryFSTWriter {

    /**
     * Control byte that marks the end of literal transitions of an accepting state
     */
    public static final byte ACCEPTING = (byte) 0x81;
    /**
     * Control byte that marks the end of literal transitions of a non-accepting state
     */
    public static final byte NONACCEPTING = (byte) 0x80;
    /**
     * Control byte that marks the end of a state
     */
    public static final byte STATEEND = (byte) 0xFF;

    /**
     * Write an FST to a binary stream, using the FST's own state iterator and symbols.
     * @param out Stream the binary representation is written to
     * @param fst The FST
     * @throws IOException If writing to the stream fails
     */
    public static void writeFST(OutputStream out, FST fst) throws IOException {
        FSTStateIterator states = fst.iter();
        String[] symbols = fst.getSymbols();
        writeFST(out, states, symbols);
    }

    /**
     * Write an FST to a binary stream.
     *
     * Output order: all symbols, a newline, the number of states, the start state id,
     * the number of transitions, then one record per state. A state record holds its
     * literal transitions (target id, input symbol id, output symbol id), one of the
     * control bytes {@link #ACCEPTING} / {@link #NONACCEPTING}, the target ids of its
     * identity transitions and finally {@link #STATEEND}.
     * @param out Stream the binary representation is written to
     * @param states Iterator over states and transitions of an FST
     * @param alphabet Symbols used by that FST
     * @throws IOException If writing to the stream fails
     */
    public static void writeFST(OutputStream out, FSTStateIterator states, String[] alphabet) throws IOException {
        final int startState = states.getStartState();
        final int stateCount = states.nOfStates();
        final int transitionCount = states.nOfTransitions();
        final int symbolCount = alphabet.length;
        final int symWidth = IOUtils.bytesNeededFor(symbolCount - 1);   // bytes per symbol id
        final int stateWidth = IOUtils.bytesNeededFor(stateCount - 1);  // bytes per state id

        // Header: symbols, terminating newline, then the counts and the start state
        for (String symbol : alphabet) {
            IOUtils.writeAsBytes(symbol, out);
        }
        IOUtils.writeNewline(out);
        IOUtils.writeInt(stateCount, out);
        IOUtils.writeIntTruncated(startState, stateWidth, out);
        IOUtils.writeInt(transitionCount, out);

        final int inputFloor = stateWidth;                // first byte of the input symbol id
        final int outputFloor = stateWidth + symWidth;    // first byte of the output symbol id
        final int recordLength = stateWidth + symWidth + symWidth;

        while (states.hasNextState()) {
            states.nextState();
            // Identity transitions are emitted after the control byte, so collect them first
            TIntList deferredTargets = new TIntArrayList();

            while (states.hasNextTransition()) {
                states.nextTransition();
                if (states.identity()) {
                    deferredTargets.add(states.toId());
                    continue;
                }
                // Literal transition: [target id | input symbol id | output symbol id]
                int target = states.toId();
                int input = states.inId();
                int output = states.outId();
                byte[] record = new byte[recordLength];
                int cursor = recordLength - 1;
                cursor = fillDownTo(record, cursor, outputFloor, output);
                cursor = fillDownTo(record, cursor, inputFloor, input);
                fillDownTo(record, cursor, 0, target);
                out.write(record);
            }

            // Control byte closing the literal transitions
            if (states.accepting()) {
                out.write(ACCEPTING);
            } else {
                out.write(NONACCEPTING);
            }

            // Identity transitions only need their target state id
            final int deferredCount = deferredTargets.size();
            for (int idx = 0; idx < deferredCount; idx++) {
                IOUtils.writeIntTruncated(deferredTargets.get(idx), stateWidth, out);
            }

            out.write(STATEEND);
        }
    }

    /**
     * Fill buf from index cursor down to index floor (inclusive) with the low-order
     * bytes of value, least significant byte at the highest index.
     * @param buf Target buffer
     * @param cursor Highest index to write
     * @param floor Lowest index to write
     * @param value The value whose bytes are written
     * @return The index just below the last one written (floor - 1 if anything was written)
     */
    private static int fillDownTo(byte[] buf, int cursor, int floor, int value) {
        while (cursor >= floor) {
            buf[cursor] = (byte) value;
            value = value >> 8;
            cursor--;
        }
        return cursor;
    }

}
Loading

0 comments on commit 8b69565

Please sign in to comment.