Skip to content

Commit

Permalink
Updated the implementation of Weighted Levenshtein distance and Levenshtein distance.
Browse files Browse the repository at this point in the history
Added new test cases that spotted the bug in the implementation. This fixes larsga#268, along with larsga#239 and larsga#244.
  • Loading branch information
ibuda committed Apr 7, 2019
1 parent 6263f9c commit 40d0c99
Show file tree
Hide file tree
Showing 4 changed files with 73 additions and 44 deletions.
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
*/
public class Levenshtein implements Comparator {

public double compare(String s1, String s2) {
public double compare(String s1, String s2) {
int len = Math.min(s1.length(), s2.length());

// we know that if the outcome here is 0.5 or lower, then the
Expand All @@ -34,7 +34,7 @@ public double compare(String s1, String s2) {
// if the strings are equal we can stop right here.
if (len == maxlen && s1.equals(s2))
return 1.0;

// we couldn't shortcut, so now we go ahead and compute the full
// metric
int dist = Math.min(compactDistance(s1, s2), len);
Expand All @@ -51,18 +51,20 @@ public boolean isTokenized() {
* speed, but still computes the entire matrix.
*/
public static int distance(String s1, String s2) {
if (s1.length() == 0)
return s2.length();
if (s2.length() == 0)
return s1.length();

int s1len = s1.length();
int s2len = s2.length();
if (s1len == 0)
return s2len;
if (s2len == 0)
return s1len;


// we use a flat array for better performance. we address it by
// s1ix + (s1len + 1) * s2ix (the stride is s1len + 1 because the
// matrix has one extra cell for the empty prefix). this modification
// improves performance by about 30%, which is definitely worth the
// extra complexity.
int[] matrix = new int[(s1len + 1) * (s2.length() + 1)];
for (int col = 0; col <= s2.length(); col++)
matrix[col * s1len] = col;
int[] matrix = new int[(s1len + 1) * (s2len + 1)];
for (int col = 0; col <= s2len; col++)
matrix[col * (s1len + 1)] = col;
for (int row = 0; row <= s1len; row++)
matrix[row] = row;

Expand All @@ -75,11 +77,11 @@ public static int distance(String s1, String s2) {
else
cost = 1;

int left = matrix[ix1 + ((ix2 + 1) * s1len)] + 1;
int above = matrix[ix1 + 1 + (ix2 * s1len)] + 1;
int aboveleft = matrix[ix1 + (ix2 * s1len)] + cost;
matrix[ix1 + 1 + ((ix2 + 1) * s1len)] =
Math.min(left, Math.min(above, aboveleft));
int left = matrix[ix1 + ((ix2 + 1) * (s1len + 1))] + 1;
int above = matrix[ix1 + 1 + (ix2 * (s1len + 1))] + 1;
int aboveleft = matrix[ix1 + (ix2 * (s1len + 1))] + cost;
matrix[ix1 + 1 + ((ix2 + 1) * (s1len + 1))] =
Math.min(left, Math.min(above, aboveleft));
}
}

Expand All @@ -89,10 +91,10 @@ public static int distance(String s1, String s2) {
// }
// System.out.println();
// }
return matrix[s1len + (s2.length() * s1len)];

return matrix[(s1len + 1) * (s2.length() + 1)-1];
}

// /**
// * An optimized version of the Wagner & Fischer algorithm, which
// * exploits our knowledge that if the distance is above a certain
Expand Down Expand Up @@ -138,7 +140,7 @@ public static int distance(String s1, String s2) {
// matrix[ix1 + 1 + ((ix2 + 1) * s1len)] = distance;
// }
// }

// return matrix[s1len + (s2.length() * s1len)];
// }

Expand All @@ -163,7 +165,7 @@ public static int distance(String s1, String s2) {
// // FIXME: modify to avoid having to initialize
// for (int ix = 1; ix < matrix.length; ix++)
// matrix[ix] = -1;

// return computeRecursively(matrix, s1, s2, s1.length(), s2.length());
// }

Expand Down Expand Up @@ -213,7 +215,7 @@ public static int distance(String s1, String s2) {
// else
// // it can't be smaller than above, so no need to compute
// left = above;

// int distance = Math.min(left, Math.min(above, aboveleft)) + cost;
// matrix[pos] = distance;
// return distance;
Expand All @@ -233,7 +235,7 @@ public static int compactDistance(String s1, String s2) {

// the maximum edit distance there is any point in reporting.
int maxdist = Math.min(s1.length(), s2.length()) / 2;

// we allocate just one column instead of the entire matrix, in
// order to save space. this also enables us to implement the
// algorithm somewhat faster. the first cell is always the
Expand Down Expand Up @@ -271,7 +273,7 @@ public static int compactDistance(String s1, String s2) {
// aboveleft: column[ix1 - 1]
// left: column[ix1]
int value = Math.min(Math.min(above, column[ix1 - 1]), column[ix1]) +
cost;
cost;
column[ix1 - 1] = above; // write previous
above = value; // keep current
smallest = Math.min(smallest, value);
Expand All @@ -285,5 +287,5 @@ public static int compactDistance(String s1, String s2) {

// ok, we're done
return above;
}
}
}
Original file line number Diff line number Diff line change
Expand Up @@ -46,37 +46,38 @@ public WeightEstimator getEstimator() {

public static double distance(String s1, String s2, WeightEstimator weight) {
int s1len = s1.length();
int s2len = s2.length();
if (s1len == 0)
return estimateCharacters(s2, weight);
if (s2.length() == 0)
if (s2len == 0)
return estimateCharacters(s1, weight);

// we use a flat array for better performance. we address it by
// s1ix + (s1len + 1) * s2ix (the stride is s1len + 1 because the
// matrix has one extra cell for the empty prefix). this modification
// improves performance by about 30%, which is definitely worth the
// extra complexity.
double[] matrix = new double[(s1len + 1) * (s2.length() + 1)];
for (int col = 0; col <= s2.length(); col++)
matrix[col * s1len] = col;
double[] matrix = new double[(s1len + 1) * (s2len + 1)];
for (int col = 0; col <= s2len; col++)
matrix[col * (s1len + 1)] = col;
for (int row = 0; row <= s1len; row++)
matrix[row] = row;

for (int ix1 = 0; ix1 < s1len; ix1++) {
char ch1 = s1.charAt(ix1);
for (int ix2 = 0; ix2 < s2.length(); ix2++) {
for (int ix2 = 0; ix2 < s2len; ix2++) {
double cost;
char ch2 = s2.charAt(ix2);
if (ch1 == ch2)
cost = 0;
else
cost = weight.substitute(ix1, ch1, s2.charAt(ix2));

double left = matrix[ix1 + ((ix2 + 1) * s1len)] +
weight.delete(ix1, ch1);
double above = matrix[ix1 + 1 + (ix2 * s1len)] +
weight.insert(ix1, ch2);
double aboveleft = matrix[ix1 + (ix2 * s1len)] + cost;
matrix[ix1 + 1 + ((ix2 + 1) * s1len)] =
Math.min(left, Math.min(above, aboveleft));
double left = matrix[ix1 + ((ix2 + 1) * (s1len + 1))] +
weight.delete(ix1, ch1);
double above = matrix[ix1 + 1 + (ix2 * (s1len + 1))] +
weight.insert(ix1, ch2);
double aboveleft = matrix[ix1 + (ix2 * (s1len + 1))] + cost;
matrix[ix1 + 1 + ((ix2 + 1) * (s1len + 1))] =
Math.min(left, Math.min(above, aboveleft));
}
}

Expand All @@ -87,7 +88,7 @@ public static double distance(String s1, String s2, WeightEstimator weight) {
// System.out.println();
// }

return matrix[s1len + (s2.length() * s1len)];
return matrix[(s1len +1) * (s2len + 1) - 1];
}

// /**
Expand Down Expand Up @@ -249,7 +250,7 @@ else if (Character.isDigit(ch))
int type = Character.getType(ch);
// 20, 21, 22, 23, 24, 25, 26, 27
if (Character.isSpace(ch) ||
(type >= 20 && type <= 27))
(type >= 20 && type <= 27))
weight = punctuation;
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ public class LevenshteinTest {
public void setup() {
this.comp = new Levenshtein();
}

// tests for the comparator

@Test
Expand All @@ -31,9 +31,9 @@ public void testComparatorTotallyDifferent() {
public void testComparatorOneInFour() {
  // "fooz" and "foos" differ only in their last character; the
  // comparator is expected to score the pair at 0.75.
  double similarity = comp.compare("fooz", "foos");
  assertEquals(0.75, similarity);
}

// tests for the original algorithm

@Test
public void testEmpty() {
assertEquals(0, Levenshtein.distance("", ""));
Expand All @@ -60,13 +60,19 @@ public void testDays() {
assertEquals(3, Levenshtein.distance("saturday", "sunday"));
assertEquals(3, Levenshtein.distance("sunday", "saturday"));
}

@Test
public void testGambol() {
  // the distance between the two words must be 2 regardless of
  // which one is passed first.
  final String longer = "gambol";
  final String shorter = "gumbo";

  assertEquals(2, Levenshtein.distance(longer, shorter));
  assertEquals(2, Levenshtein.distance(shorter, longer));
}

@Test
public void testAbc() {
  // a one-character string against a three-character string that
  // starts with it: expected distance is 2 in both argument orders.
  final String single = "a";
  final String triple = "abc";

  assertEquals(2, Levenshtein.distance(single, triple));
  assertEquals(2, Levenshtein.distance(triple, single));
}

@Test
public void testTotallyUnlike() {
assertEquals(4, Levenshtein.distance("abcd", "efgh"));
Expand Down Expand Up @@ -100,7 +106,7 @@ public void testCDays() {
assertEquals(3, Levenshtein.compactDistance("saturday", "sunday"));
assertEquals(3, Levenshtein.compactDistance("sunday", "saturday"));
}

@Test
public void testCGambol() {
assertEquals(2, Levenshtein.compactDistance("gambol", "gumbo"));
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ public class WeightedLevenshteinTest {
public void setup() {
e = new WeightedLevenshtein.DefaultWeightEstimator();
}

@Test
public void testEmpty() {
assertEquals(0.0, WeightedLevenshtein.distance("", "", e));
Expand Down Expand Up @@ -43,6 +43,26 @@ public void testSubstitute2() {
assertEquals(3.0, WeightedLevenshtein.distance("totanic 1", "titanic 2", e));
}

@Test
public void testAbc() {
  // weighted variant, using the estimator created in setup():
  // "abc" versus "a" is expected to cost 2.0 in both argument orders.
  final String triple = "abc";
  final String single = "a";

  assertEquals(2.0, WeightedLevenshtein.distance(triple, single, e));
  assertEquals(2.0, WeightedLevenshtein.distance(single, triple, e));
}

@Test
public void test123() {
  // the digit weight must be raised before any distance is computed.
  e.setDigitWeight(2.0);

  // with a digit weight of 2.0, "1" versus "123" is expected to cost
  // 4.0 in both argument orders.
  final String single = "1";
  final String triple = "123";

  assertEquals(4.0, WeightedLevenshtein.distance(single, triple, e));
  assertEquals(4.0, WeightedLevenshtein.distance(triple, single, e));
}

@Test
public void testAlphaNumeric() {
  // the digit weight must be raised before any distance is computed.
  e.setDigitWeight(2.0);

  // two equal-length strings mixing letters and digits; the expected
  // cost is 8.0 in both argument orders.
  final String first = "a2c3e";
  final String second = "1b1d1";

  assertEquals(8.0, WeightedLevenshtein.distance(first, second, e));
  assertEquals(8.0, WeightedLevenshtein.distance(second, first, e));
}

@Test
public void testComparator() {
WeightedLevenshtein comp = new WeightedLevenshtein();
Expand Down

0 comments on commit 40d0c99

Please sign in to comment.