Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

This fixes Classical and Weighted Levenshtein distances. #269

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
*/
public class Levenshtein implements Comparator {

public double compare(String s1, String s2) {
public double compare(String s1, String s2) {
int len = Math.min(s1.length(), s2.length());

// we know that if the outcome here is 0.5 or lower, then the
Expand All @@ -34,7 +34,7 @@ public double compare(String s1, String s2) {
// if the strings are equal we can stop right here.
if (len == maxlen && s1.equals(s2))
return 1.0;

// we couldn't shortcut, so now we go ahead and compute the full
// metric
int dist = Math.min(compactDistance(s1, s2), len);
Expand All @@ -51,18 +51,20 @@ public boolean isTokenized() {
* speed, but still computes the entire matrix.
*/
public static int distance(String s1, String s2) {
if (s1.length() == 0)
return s2.length();
if (s2.length() == 0)
return s1.length();

int s1len = s1.length();
int s2len = s2.length();
if (s1len == 0)
return s2len;
if (s2len == 0)
return s1len;


// we use a flat array for better performance. we address it by
// s1ix + (s1len + 1) * s2ix, since each row holds s1len + 1 cells.
// this modification improves performance by about 30%, which is
// definitely worth the extra complexity.
int[] matrix = new int[(s1len + 1) * (s2.length() + 1)];
for (int col = 0; col <= s2.length(); col++)
matrix[col * s1len] = col;
int[] matrix = new int[(s1len + 1) * (s2len + 1)];
for (int col = 0; col <= s2len; col++)
matrix[col * (s1len + 1)] = col;
for (int row = 0; row <= s1len; row++)
matrix[row] = row;

Expand All @@ -75,11 +77,11 @@ public static int distance(String s1, String s2) {
else
cost = 1;

int left = matrix[ix1 + ((ix2 + 1) * s1len)] + 1;
int above = matrix[ix1 + 1 + (ix2 * s1len)] + 1;
int aboveleft = matrix[ix1 + (ix2 * s1len)] + cost;
matrix[ix1 + 1 + ((ix2 + 1) * s1len)] =
Math.min(left, Math.min(above, aboveleft));
int left = matrix[ix1 + ((ix2 + 1) * (s1len + 1))] + 1;
int above = matrix[ix1 + 1 + (ix2 * (s1len + 1))] + 1;
int aboveleft = matrix[ix1 + (ix2 * (s1len + 1))] + cost;
matrix[ix1 + 1 + ((ix2 + 1) * (s1len + 1))] =
Math.min(left, Math.min(above, aboveleft));
}
}

Expand All @@ -89,10 +91,10 @@ public static int distance(String s1, String s2) {
// }
// System.out.println();
// }
return matrix[s1len + (s2.length() * s1len)];

return matrix[(s1len + 1) * (s2.length() + 1)-1];
}

// /**
// * An optimized version of the Wagner & Fischer algorithm, which
// * exploits our knowledge that if the distance is above a certain
Expand Down Expand Up @@ -138,7 +140,7 @@ public static int distance(String s1, String s2) {
// matrix[ix1 + 1 + ((ix2 + 1) * s1len)] = distance;
// }
// }

// return matrix[s1len + (s2.length() * s1len)];
// }

Expand All @@ -163,7 +165,7 @@ public static int distance(String s1, String s2) {
// // FIXME: modify to avoid having to initialize
// for (int ix = 1; ix < matrix.length; ix++)
// matrix[ix] = -1;

// return computeRecursively(matrix, s1, s2, s1.length(), s2.length());
// }

Expand Down Expand Up @@ -213,7 +215,7 @@ public static int distance(String s1, String s2) {
// else
// // it can't be smaller than above, so no need to compute
// left = above;

// int distance = Math.min(left, Math.min(above, aboveleft)) + cost;
// matrix[pos] = distance;
// return distance;
Expand All @@ -233,7 +235,7 @@ public static int compactDistance(String s1, String s2) {

// the maximum edit distance there is any point in reporting.
int maxdist = Math.min(s1.length(), s2.length()) / 2;

// we allocate just one column instead of the entire matrix, in
// order to save space. this also enables us to implement the
// algorithm somewhat faster. the first cell is always the
Expand Down Expand Up @@ -271,7 +273,7 @@ public static int compactDistance(String s1, String s2) {
// aboveleft: column[ix1 - 1]
// left: column[ix1]
int value = Math.min(Math.min(above, column[ix1 - 1]), column[ix1]) +
cost;
cost;
column[ix1 - 1] = above; // write previous
above = value; // keep current
smallest = Math.min(smallest, value);
Expand All @@ -285,5 +287,5 @@ public static int compactDistance(String s1, String s2) {

// ok, we're done
return above;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,37 +46,38 @@ public WeightEstimator getEstimator() {

public static double distance(String s1, String s2, WeightEstimator weight) {
int s1len = s1.length();
int s2len = s2.length();
if (s1len == 0)
return estimateCharacters(s2, weight);
if (s2.length() == 0)
if (s2len == 0)
return estimateCharacters(s1, weight);

// we use a flat array for better performance. we address it by
// s1ix + (s1len + 1) * s2ix, since each row holds s1len + 1 cells.
// this modification improves performance by about 30%, which is
// definitely worth the extra complexity.
double[] matrix = new double[(s1len + 1) * (s2.length() + 1)];
for (int col = 0; col <= s2.length(); col++)
matrix[col * s1len] = col;
double[] matrix = new double[(s1len + 1) * (s2len + 1)];
for (int col = 0; col <= s2len; col++)
matrix[col * (s1len + 1)] = col;
for (int row = 0; row <= s1len; row++)
matrix[row] = row;

for (int ix1 = 0; ix1 < s1len; ix1++) {
char ch1 = s1.charAt(ix1);
for (int ix2 = 0; ix2 < s2.length(); ix2++) {
for (int ix2 = 0; ix2 < s2len; ix2++) {
double cost;
char ch2 = s2.charAt(ix2);
if (ch1 == ch2)
cost = 0;
else
cost = weight.substitute(ix1, ch1, s2.charAt(ix2));

double left = matrix[ix1 + ((ix2 + 1) * s1len)] +
weight.delete(ix1, ch1);
double above = matrix[ix1 + 1 + (ix2 * s1len)] +
weight.insert(ix1, ch2);
double aboveleft = matrix[ix1 + (ix2 * s1len)] + cost;
matrix[ix1 + 1 + ((ix2 + 1) * s1len)] =
Math.min(left, Math.min(above, aboveleft));
double left = matrix[ix1 + ((ix2 + 1) * (s1len + 1))] +
weight.delete(ix1, ch1);
double above = matrix[ix1 + 1 + (ix2 * (s1len + 1))] +
weight.insert(ix1, ch2);
double aboveleft = matrix[ix1 + (ix2 * (s1len + 1))] + cost;
matrix[ix1 + 1 + ((ix2 + 1) * (s1len + 1))] =
Math.min(left, Math.min(above, aboveleft));
}
}

Expand All @@ -87,7 +88,7 @@ public static double distance(String s1, String s2, WeightEstimator weight) {
// System.out.println();
// }

return matrix[s1len + (s2.length() * s1len)];
return matrix[(s1len +1) * (s2len + 1) - 1];
}

// /**
Expand Down Expand Up @@ -249,7 +250,7 @@ else if (Character.isDigit(ch))
int type = Character.getType(ch);
// 20, 21, 22, 23, 24, 25, 26, 27
if (Character.isSpace(ch) ||
(type >= 20 && type <= 27))
(type >= 20 && type <= 27))
weight = punctuation;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ public class LevenshteinTest {
public void setup() {
this.comp = new Levenshtein();
}

// tests for the comparator

@Test
Expand All @@ -31,9 +31,9 @@ public void testComparatorTotallyDifferent() {
public void testComparatorOneInFour() {
assertEquals(0.75, comp.compare("fooz", "foos"));
}

// tests for the original algorithm

@Test
public void testEmpty() {
assertEquals(0, Levenshtein.distance("", ""));
Expand All @@ -60,13 +60,19 @@ public void testDays() {
assertEquals(3, Levenshtein.distance("saturday", "sunday"));
assertEquals(3, Levenshtein.distance("sunday", "saturday"));
}

// "gambol" vs "gumbo": one substitution (a -> u) plus one trailing
// character removed/added, so the expected distance is 2. Checked in
// both argument orders to confirm the result is symmetric.
@Test
public void testGambol() {
assertEquals(2, Levenshtein.distance("gambol", "gumbo"));
assertEquals(2, Levenshtein.distance("gumbo", "gambol"));
}

// Strings of unequal length sharing a one-character prefix: turning
// "a" into "abc" (or back) takes two single-character edits. Checked
// in both argument orders.
// NOTE(review): looks like the regression case for the flat-matrix
// row-stride fix in distance() -- confirm.
@Test
public void testAbc() {
assertEquals(2, Levenshtein.distance("a", "abc"));
assertEquals(2, Levenshtein.distance("abc", "a"));
}

@Test
public void testTotallyUnlike() {
assertEquals(4, Levenshtein.distance("abcd", "efgh"));
Expand Down Expand Up @@ -100,7 +106,7 @@ public void testCDays() {
assertEquals(3, Levenshtein.compactDistance("saturday", "sunday"));
assertEquals(3, Levenshtein.compactDistance("sunday", "saturday"));
}

@Test
public void testCGambol() {
assertEquals(2, Levenshtein.compactDistance("gambol", "gumbo"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ public class WeightedLevenshteinTest {
public void setup() {
e = new WeightedLevenshtein.DefaultWeightEstimator();
}

@Test
public void testEmpty() {
assertEquals(0.0, WeightedLevenshtein.distance("", "", e));
Expand Down Expand Up @@ -43,6 +43,26 @@ public void testSubstitute2() {
assertEquals(3.0, WeightedLevenshtein.distance("totanic 1", "titanic 2", e));
}

// Weighted counterpart of the unequal-length case: "abc" vs "a"
// differ by two letters, and the estimator created in setup() is
// expected to price that at 2.0 in either argument order.
// NOTE(review): presumably the default letter weight is 1.0 -- confirm
// against DefaultWeightEstimator.
@Test
public void testAbc() {
assertEquals(2.0, WeightedLevenshtein.distance("abc", "a", e));
assertEquals(2.0, WeightedLevenshtein.distance("a", "abc", e));
}

// Same unequal-length shape as testAbc but with digits, after raising
// the digit weight to 2.0: "1" vs "123" differ by two digits and the
// expected distance is 4.0 in either argument order.
@Test
public void test123() {
e.setDigitWeight(2.0); // make digit edits cost more than the default
assertEquals(4.0, WeightedLevenshtein.distance("1", "123", e));
assertEquals(4.0, WeightedLevenshtein.distance("123", "1", e));
}

// Equal-length strings that interleave letters and digits and differ
// at every position, with the digit weight raised to 2.0. The expected
// distance is 8.0 in both argument orders.
// NOTE(review): how 8.0 decomposes into individual edit costs depends
// on the estimator's substitute/insert/delete rules -- verify there.
@Test
public void testAlphaNumeric() {
e.setDigitWeight(2.0);
assertEquals(8.0, WeightedLevenshtein.distance("a2c3e", "1b1d1", e));
assertEquals(8.0, WeightedLevenshtein.distance("1b1d1", "a2c3e", e));
}

@Test
public void testComparator() {
WeightedLevenshtein comp = new WeightedLevenshtein();
Expand Down