For a client-side search tool, I need to find Levenshtein’s distance from a word with millions of other words. The user should be able to compare a short text of about twenty words with a book. The user could do this by finding the locations of the most characteristic words of the text in the book. "Searching for places" does not mean that you are looking for an exact match, but almost match Levenshtein. I started with the available implementations, but I needed a higher speed. I ended up with this:
var rowA = new Uint16Array(1e6); var rowB = new Uint16Array(1e6); function levenshtein(s1, s2) { var s1_len = s1.length, s2_len = s2.length, i1, i2 = 0, a, b, c, c2, i = 0; if (s1_len === 0) return s2_len; if (s2_len === 0) return s1_len; while (i < s1_len) rowA[i] = ++i; while (i2 < s2_len) { c2 = s2[i2]; a = i2; ++i2; b = i2; for (i1 = 0; i1 < s1_len; ++i1) { c = a + (s1[i1] !== c2 ); a = rowA[i1]; b = b < a ? (b < c ? b + 1 : c) : (a < c ? a + 1 : c); rowB[i1] = b; } if (i2 === s2_len) return b; c2 = s2[i2]; a = i2; ++i2; b = i2; for (i1 = 0; i1 < s1_len; ++i1) { c = a + (s1[i1] !== c2 ); a = rowB[i1]; b = b < a ? (b < c ? b + 1 : c) : (a < c ? a + 1 : c); rowA[i1] = b; } } return b; }
As you can see, I used methods like putting objects from a function to use them. I also repeated myself a bit, linearizing the loop somewhat. Could it be faster? I'm curious about your advice.
Update: After advice from Bergi and some other considerations, I came to this solution:
var row = new Uint16Array(1e6); function levenshtein(s1, s2) { var s1_len = s1.length, s2_len = s2.length, i2 = 1, a, b = 0, c, c2, i1 = 0; if (s1_len === 0) return s2_len; if (s2_len === 0) return s1_len; c2 = s2[0]; if (s1[0] === c2) { while (i1 < s1_len) { row[i1] = i1++; } b = s1_len - 1; } else { row[0] = 1; ++b; if (s1_len > 1) for (i1 = 1; i1 < s1_len; ++i1) { if (s1[i1] === c2) { row[i1] = b; for (++i1; i1 < s1_len; ++i1) { row[i1] = ++b; } } else { row[i1] = ++b; } } } if (s2_len > 1) while (i2 < s2_len) { c2 = s2[i2]; c = i2 + (s1[0] !== c2); a = row[0]; ++i2; b = i2 < a ? (i2 < c ? i2 + 1 : c) : (a < c ? a + 1 : c); row[0] = b; if (s1_len > 1) { for (i1 = 1; i1 < s1_len; ++i1) { c = a + (s1[i1] !== c2); a = row[i1]; b = b < a ? (b < c ? b + 1 : c) : (a < c ? a + 1 : c); row[i1] = b; } } } return b; }
This is again much faster. I can no longer squeeze out of it. I keep looking for other ideas and try some more.
javascript algorithm levenshtein distance
Marco de wit
source share