/** * Mnemonist PassjoinIndex * ======================== * * The PassjoinIndex is an index leveraging the "passjoin" algorithm as a mean * to index strings for Levenshtein distance queries. It features a complexity * related to the Levenshtein query threshold k rather than the number of * strings to test (roughly O(k^3)). * * [References]: * Jiang, Yu, Dong Deng, Jiannan Wang, Guoliang Li, et Jianhua Feng. * « Efficient Parallel Partition-Based Algorithms for Similarity Search and Join * with Edit Distance Constraints ». In Proceedings of the Joint EDBT/ICDT 2013 * Workshops on - EDBT ’13, 341. Genoa, Italy: ACM Press, 2013. * https://doi.org/10.1145/2457317.2457382. * * Li, Guoliang, Dong Deng, et Jianhua Feng. « A Partition-Based Method for * String Similarity Joins with Edit-Distance Constraints ». ACM Transactions on * Database Systems 38, no 2 (1 juin 2013): 1‑33. * https://doi.org/10.1145/2487259.2487261. * * [Urls]: * http://people.csail.mit.edu/dongdeng/projects/passjoin/index.html */ var Iterator = require('obliterator/iterator'), forEach = require('obliterator/foreach'); // TODO: leveraging BagDistance as an upper bound of Levenshtein // TODO: leverage n-grams recursive indexing // TODO: try the MultiArray as a memory backend // TODO: what about damerau levenshtein /** * Helpers. */ /** * Function returning the number of substrings that will be selected by the * multi-match-aware selection scheme for theshold `k`, for a string of length * `s` to match strings of length `l`. * * @param {number} k - Levenshtein distance threshold. * @param {number} s - Length of target strings. * @param {number} l - Length of strings to match. * @returns {number} - The number of selected substrings. */ function countSubstringsL(k, s, l) { return (((Math.pow(k, 2) - Math.pow(Math.abs(s - l), 2)) / 2) | 0) + k + 1; } /** * Function returning the minimum number of substrings that will be selected by * the multi-match-aware selection scheme for theshold `k`, for a string of * length `s` to match any string of relevant length. * * @param {number} k - Levenshtein distance threshold. * @param {number} s - Length of target strings. * @returns {number} - The number of selected substrings. */ function countKeys(k, s) { var c = 0; for (var l = 0, m = s + 1; l < m; l++) c += countSubstringsL(k, s, l); return c; } /** * Function used to compare two keys in order to sort them first by decreasing * length and then alphabetically as per the "4.2 Effective Indexing Strategy" * point of the paper. * * @param {number} k - Levenshtein distance threshold. * @param {number} s - Length of target strings. * @returns {number} - The number of selected substrings. */ function comparator(a, b) { if (a.length > b.length) return -1; if (a.length < b.length) return 1; if (a < b) return -1; if (a > b) return 1; return 0; } /** * Function partitioning a string into k + 1 uneven segments, the shorter * ones, then the longer ones. * * @param {number} k - Levenshtein distance threshold. * @param {number} l - Length of the string. * @returns {Array} - The partition tuples (start, length). */ function partition(k, l) { var m = k + 1, a = (l / m) | 0, b = a + 1, i, j; var largeSegments = l - a * m, smallSegments = m - largeSegments; var tuples = new Array(k + 1); for (i = 0; i < smallSegments; i++) tuples[i] = [i * a, a]; var offset = (i - 1) * a + a; for (j = 0; j < largeSegments; j++) tuples[i + j] = [offset + j * b, b]; return tuples; } /** * Function yielding a string's k + 1 passjoin segments to index. * * @param {number} k - Levenshtein distance threshold. * @param {string} string - Target string. * @returns {Array} - The string's segments. */ function segments(k, string) { var l = string.length, m = k + 1, a = (l / m) | 0, b = a + 1, o, i, j; var largeSegments = l - a * m, smallSegments = m - largeSegments; var S = new Array(k + 1); for (i = 0; i < smallSegments; i++) { o = i * a; S[i] = string.slice(o, o + a); } var offset = (i - 1) * a + a; for (j = 0; j < largeSegments; j++) { o = offset + j * b; S[i + j] = string.slice(o, o + b); } return S; } // TODO: jsdocs function segmentPos(k, i, string) { if (i === 0) return 0; var l = string.length; var m = k + 1, a = (l / m) | 0, b = a + 1; var largeSegments = l - a * m, smallSegments = m - largeSegments; if (i <= smallSegments - 1) return i * a; var offset = i - smallSegments; return smallSegments * a + offset * b; } /** * Function returning the interval of relevant substrings to lookup using the * multi-match-aware substring selection scheme described in the paper. * * @param {number} k - Levenshtein distance threshold. * @param {number} delta - Signed length difference between both considered strings. * @param {number} i - k + 1 segment index. * @param {number} s - String's length. * @param {number} pi - k + 1 segment position in target string. * @param {number} li - k + 1 segment length. * @returns {Array} - The interval (start, stop). */ function multiMatchAwareInterval(k, delta, i, s, pi, li) { var start1 = pi - i, end1 = pi + i; var o = k - i; var start2 = pi + delta - o, end2 = pi + delta + o; var end3 = s - li; return [Math.max(0, start1, start2), Math.min(end1, end2, end3)]; } /** * Function yielding relevant substrings to lookup using the multi-match-aware * substring selection scheme described in the paper. * * @param {number} k - Levenshtein distance threshold. * @param {string} string - Target string. * @param {number} l - Length of strings to match. * @param {number} i - k + 1 segment index. * @param {number} pi - k + 1 segment position in target string. * @param {number} li - k + 1 segment length. * @returns {Array} - The contiguous substrings. */ function multiMatchAwareSubstrings(k, string, l, i, pi, li) { var s = string.length; // Note that we need to keep the non-absolute delta for this function // to work in both directions, up & down var delta = s - l; var interval = multiMatchAwareInterval(k, delta, i, s, pi, li); var start = interval[0], stop = interval[1]; var currentSubstring = ''; var substrings = []; var substring, j, m; for (j = start, m = stop + 1; j < m; j++) { substring = string.slice(j, j + li); // We skip identical consecutive substrings (to avoid repetition in case // of contiguous letter duplication) if (substring === currentSubstring) continue; substrings.push(substring); currentSubstring = substring; } return substrings; } /** * PassjoinIndex. * * @note I tried to apply the paper's optimizations regarding Levenshtein * distance computations but it did not provide a performance boost, quite * the contrary. This is because since we are mostly using the index for small k * here, most of the strings we work on are quite small and the bookkeeping * induced by Ukkonen's method and the paper's one are slowing us down more than * they actually help us go faster. * * @note This implementation does not try to ensure that you add the same string * more than once. * * @constructor * @param {function} levenshtein - Levenshtein distance function. * @param {number} k - Levenshtein distance threshold. */ function PassjoinIndex(levenshtein, k) { if (typeof levenshtein !== 'function') throw new Error('mnemonist/passjoin-index: `levenshtein` should be a function returning edit distance between two strings.'); if (typeof k !== 'number' || k < 1) throw new Error('mnemonist/passjoin-index: `k` should be a number > 0'); this.levenshtein = levenshtein; this.k = k; this.clear(); } /** * Method used to clear the structure. * * @return {undefined} */ PassjoinIndex.prototype.clear = function() { // Properties this.size = 0; this.strings = []; this.invertedIndices = {}; }; /** * Method used to add a new value to the index. * * @param {string|Array} value - Value to add. * @return {PassjoinIndex} */ PassjoinIndex.prototype.add = function(value) { var l = value.length; var stringIndex = this.size; this.strings.push(value); this.size++; var S = segments(this.k, value); var Ll = this.invertedIndices[l]; if (typeof Ll === 'undefined') { Ll = {}; this.invertedIndices[l] = Ll; } var segment, matches, key, i, m; for (i = 0, m = S.length; i < m; i++) { segment = S[i]; key = segment + i; matches = Ll[key]; if (typeof matches === 'undefined') { matches = [stringIndex]; Ll[key] = matches; } else { matches.push(stringIndex); } } return this; }; /** * Method used to search for string matching the given query. * * @param {string|Array} query - Query string. * @return {Array} */ PassjoinIndex.prototype.search = function(query) { var s = query.length, k = this.k; var M = new Set(); var candidates, candidate, queryPos, querySegmentLength, key, S, P, l, m, i, n1, j, n2, y, n3; for (l = Math.max(0, s - k), m = s + k + 1; l < m; l++) { var Ll = this.invertedIndices[l]; if (typeof Ll === 'undefined') continue; P = partition(k, l); for (i = 0, n1 = P.length; i < n1; i++) { queryPos = P[i][0]; querySegmentLength = P[i][1]; S = multiMatchAwareSubstrings( k, query, l, i, queryPos, querySegmentLength ); // Empty string edge case if (!S.length) S = ['']; for (j = 0, n2 = S.length; j < n2; j++) { key = S[j] + i; candidates = Ll[key]; if (typeof candidates === 'undefined') continue; for (y = 0, n3 = candidates.length; y < n3; y++) { candidate = this.strings[candidates[y]]; // NOTE: first condition is here not to compute Levenshtein // distance for tiny strings // NOTE: maintaining a Set of rejected candidate is not really useful // because it consumes more memory and because non-matches are // less likely to be candidates agains if ( s <= k && l <= k || ( !M.has(candidate) && this.levenshtein(query, candidate) <= k ) ) M.add(candidate); } } } } return M; }; /** * Method used to iterate over the index. * * @param {function} callback - Function to call for each item. * @param {object} scope - Optional scope. * @return {undefined} */ PassjoinIndex.prototype.forEach = function(callback, scope) { scope = arguments.length > 1 ? scope : this; for (var i = 0, l = this.strings.length; i < l; i++) callback.call(scope, this.strings[i], i, this); }; /** * Method used to create an iterator over a index's values. * * @return {Iterator} */ PassjoinIndex.prototype.values = function() { var strings = this.strings, l = strings.length, i = 0; return new Iterator(function() { if (i >= l) return { done: true }; var value = strings[i]; i++; return { value: value, done: false }; }); }; /** * Attaching the #.values method to Symbol.iterator if possible. */ if (typeof Symbol !== 'undefined') PassjoinIndex.prototype[Symbol.iterator] = PassjoinIndex.prototype.values; /** * Convenience known methods. */ PassjoinIndex.prototype.inspect = function() { var array = this.strings.slice(); // Trick so that node displays the name of the constructor Object.defineProperty(array, 'constructor', { value: PassjoinIndex, enumerable: false }); return array; }; if (typeof Symbol !== 'undefined') PassjoinIndex.prototype[Symbol.for('nodejs.util.inspect.custom')] = PassjoinIndex.prototype.inspect; /** * Static @.from function taking an arbitrary iterable & converting it into * a structure. * * @param {Iterable} iterable - Target iterable. * @return {PassjoinIndex} */ PassjoinIndex.from = function(iterable, levenshtein, k) { var index = new PassjoinIndex(levenshtein, k); forEach(iterable, function(string) { index.add(string); }); return index; }; /** * Exporting. */ PassjoinIndex.countKeys = countKeys; PassjoinIndex.comparator = comparator; PassjoinIndex.partition = partition; PassjoinIndex.segments = segments; PassjoinIndex.segmentPos = segmentPos; PassjoinIndex.multiMatchAwareInterval = multiMatchAwareInterval; PassjoinIndex.multiMatchAwareSubstrings = multiMatchAwareSubstrings; module.exports = PassjoinIndex;