Here is a solution using a suffix tree:
function SuffixTree(text) { var regex = /\b\w+/g; var words = text.match(regex); var wave = []; var words_l = words.length; if (words_l == 0) return false; this.tree = this.node("", false); for (var i = 0; i < words_l; ++i) { var x = words[i] + "_"; wave.push(this.tree); var wave_l = wave.length; for (var j = 0; j < wave_l; ++j) { var y = wave[j]; if (typeof y[x] != 'undefined') y[x].count++; else y[x] = this.node(words[i], y); wave[j] = y[x]; } } } SuffixTree.prototype = { dummy: {count: 1}, node: function(word, num, parent) { return { count: 1, word: word, parent: parent }; }, duplicates: function(h) { this.dups = []; this.bypass(this.tree, h, 0); var l = this.dups.length; this.dups.sort(function(d1, d2) { return d1.depth > d2.depth ? 1 : -1; }); for (var i = 0; i < l; ++i) { var d = this.dups[i]; this.dups[i] = { s: " " + this.sentence(da) + " ", depth: d.depth, count: dacount }; } for (var i = 0; i < l; ++i) { var d = this.dups[i]; console.log(i, ds); } for (var i = 0; i < l; ++i) { var d = this.dups[i]; var fl = true; for (var j = i + 1; j < l; ++j) { if (this.dups[j].s.indexOf(ds) != -1) fl = false; } if (fl) h(dssubstr(1, dslength - 2), d.count); } }, bypass: function(a, h, depth) { if (a.constructor != Object) return; var fl = true; for (var i in a) { if (i == 'parent') continue; var b = a[i]; if (b.count == a.count) fl = false; this.bypass(b, h, depth + 1); } if (fl && a.count > 1) { this.dups.push({ a: a, depth: depth }); } }, sentence: function(a) { var s = a.word; while (a = a.parent) { s = a.word + " " + s; } return s; } }; var text = "This is a text with some duplicates: words, sentences of different length. For example here is a duplicate word. This sentence has some duplicates. But not all of us can find clones."; var T = new SuffixTree(text); var h = function(s, c) { document.write(s + "[" + c + "]<br/>"); }; T.duplicates(h);
1) Divide the input text into an array of words. 2) Create a suffix tree. 3) Find the longest tree suffixes. 4) Delete sentences that are contained in others (ie Delete "is", which is part of "this is").
You can change the regular expression to take into account html tags.
Hope this helps you.
PS h is a callback for found duplicates.
artyom.stv
source share