Javascript / jQuery Find duplicate text

How can you find duplicates in a text document? Duplicates can be a set of consecutive words or whole sentences. A sentence does not necessarily end with a period. Let's say the page contains a document of 200 lines, of which 2 sentences are identical; we want to highlight these 2 sentences as duplicates when the user clicks the "Check duplicate" button.

+7
source share
3 answers

Interesting question. Here is an idea of how I would do it: http://jsfiddle.net/SaQAs/1/ (not optimized!)

// Highlights duplicated words or duplicated sentences inside the page's <p>
// content. The text is read once on load; clicking `a.words` or `a.sentences`
// rewrites the paragraph with every duplicate wrapped in
// <span class="duplicate">.
var text = $('p').text(),
    words = text.split(' '),
    sentences = text.split('.'),
    duplicateWords = findDuplicates(words),
    duplicateSentences = findDuplicates(sentences);

/**
 * Returns every value that occurs more than once in `items`, listed once.
 *
 * A sorted copy is scanned so equal values are adjacent; a value is recorded
 * only when it differs from the last one recorded, which de-duplicates the
 * result without $.unique (documented for arrays of DOM elements only).
 *
 * @param {string[]} items - fragments to inspect; not modified.
 * @returns {string[]} the repeated fragments, in sorted order.
 */
function findDuplicates(items) {
  var sorted = items.slice(0).sort(),
      duplicates = [];
  for (var i = 0; i < sorted.length - 1; i++) {
    if (sorted[i + 1] === sorted[i] &&
        duplicates[duplicates.length - 1] !== sorted[i]) {
      duplicates.push(sorted[i]);
    }
  }
  return duplicates;
}

/**
 * Escapes the characters that .html() would otherwise parse as markup.
 * The fragments come from .text(), so they are plain text, not HTML.
 *
 * @param {string} str - plain text.
 * @returns {string} text that is safe to embed in an HTML string.
 */
function escapeHtml(str) {
  return str.replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;');
}

/**
 * Maps each fragment to its HTML form, wrapping the ones found in
 * `duplicates` in a highlighting span.
 *
 * @param {string[]} items - fragments in document order.
 * @param {string[]} duplicates - fragments that must be highlighted.
 * @returns {string[]} HTML snippets, one per fragment.
 */
function markDuplicates(items, duplicates) {
  return $.map(items, function (item) {
    var safe = escapeHtml(item);
    if ($.inArray(item, duplicates) > -1) {
      return '<span class="duplicate">' + safe + '</span>';
    }
    return safe;
  });
}

$('a.words').click(function () {
  $('p').html(markDuplicates(words, duplicateWords).join(' '));
  return false; // suppress the link's default navigation
});

$('a.sentences').click(function () {
  $('p').html(markDuplicates(sentences, duplicateSentences).join('.'));
  return false; // suppress the link's default navigation
});

Update 1

This version highlights sequences of identical words: http://jsfiddle.net/YQdk5/1/ From here it should not be difficult to, for example, ignore any punctuation marks at the end of fragments when comparing - you just need to write your own version of the inArray method.

 // Wraps every run of consecutive duplicated words in the page's <p> content
// in a single <span class="duplicate"> and writes the result back.
var text = $('p').text(),
    words = text.split(' '),
    sortedWords = words.slice(0).sort(),
    duplicateWords = [], // the comma here was missing, which was a syntax error
    highlighted = [];

// Equal words are adjacent in the sorted copy. Recording a word only when it
// differs from the last recorded one keeps the list unique without $.unique,
// which is documented for arrays of DOM elements only.
for (var i = 0; i < sortedWords.length - 1; i++) {
  if (sortedWords[i + 1] === sortedWords[i] &&
      duplicateWords[duplicateWords.length - 1] !== sortedWords[i]) {
    duplicateWords.push(sortedWords[i]);
  }
}

// m[j] records whether words[j] is a duplicate, so a span is opened or closed
// only where that flag changes between neighbouring words.
for (var j = 0, m = []; j < words.length; j++) {
  m.push($.inArray(words[j], duplicateWords) > -1);
  if (!m[j] && m[j - 1]) highlighted.push('</span>');
  else if (m[j] && !m[j - 1]) highlighted.push('<span class="duplicate">');
  // The words come from .text(), so escape them before handing them to .html().
  highlighted.push(words[j].replace(/&/g, '&amp;').replace(/</g, '&lt;').replace(/>/g, '&gt;'));
}

// Close a run that extends to the very last word.
if (m[m.length - 1]) highlighted.push('</span>');

$('p').html(highlighted.join(' '));

Update 2

My regex-fu is weak, but this (pretty dirty!) version works fine: http://jsfiddle.net/YQdk5/2/ - I'm pretty sure there is a better way to do this, but for now I have to leave it alone! :D - Good luck!

Update 3

Thinking about this some more, I don't think the code from the previous update is any good, which is why I removed it. You can still find it here: http://jsfiddle.net/YQdk5/2/ The main idea is to use a regular expression to match words, something along these lines:

 // Pattern sketch: matches a fragment that is exactly the text "word",
 // optionally followed by one period (captured in group 1).
 /^word(\.?)$/ 
+5
source

Here is a solution using a suffix tree:

 /**
 * Builds a word-level suffix trie of `text`.
 *
 * Every node is a plain object `{ count, word, parent }` whose children are
 * stored on the same object under the key `word + "_"`. `count` is the number
 * of times the word sequence leading to that node occurs in the text.
 *
 * @param {string} text - input text; words are the matches of /\b\w+/g.
 */
function SuffixTree(text) {
  // match() returns null when nothing matches; treat that as "no words".
  var words = text.match(/\b\w+/g) || [];
  // wave[j] is the node reached by the words from position j up to the current one.
  var wave = [];

  // The root is always created so duplicates() also works on word-less text.
  this.tree = this.node("", 0, null);

  for (var i = 0; i < words.length; ++i) {
    var key = words[i] + "_";
    wave.push(this.tree); // a new suffix starts at every word
    for (var j = 0; j < wave.length; ++j) {
      var current = wave[j];
      if (typeof current[key] !== 'undefined') current[key].count++;
      else current[key] = this.node(words[i], 0, current);
      wave[j] = current[key];
    }
  }
}

SuffixTree.prototype = {
  dummy: { count: 1 },

  /**
   * Creates a trie node.
   *
   * @param {string} word - word represented by the node ("" for the root).
   * @param {*} num - unused; kept so the parameter order stays compatible.
   * @param {Object|null} parent - parent node, or null for the root.
   * @returns {{count: number, word: string, parent: (Object|null)}}
   */
  node: function(word, num, parent) {
    return { count: 1, word: word, parent: parent };
  },

  /**
   * Reports every repeated word sequence that is not contained in a longer
   * repeated sequence.
   *
   * @param {function(string, number)} h - called once per reported sequence
   *   with the sequence text and its occurrence count.
   */
  duplicates: function(h) {
    this.dups = [];
    this.bypass(this.tree, h, 0);

    // Shortest first, so any sequence that could contain dups[i] comes after it.
    this.dups.sort(function(d1, d2) { return d1.depth - d2.depth; });

    var l = this.dups.length;
    for (var i = 0; i < l; ++i) {
      var d = this.dups[i];
      // Pad with spaces so the containment test below matches whole words only.
      this.dups[i] = { s: " " + this.sentence(d.a) + " ", depth: d.depth, count: d.a.count };
    }

    for (var k = 0; k < l; ++k) {
      var candidate = this.dups[k];
      var contained = false;
      for (var j = k + 1; j < l; ++j) {
        if (this.dups[j].s.indexOf(candidate.s) !== -1) contained = true;
      }
      // Strip the padding spaces again before reporting.
      if (!contained) h(candidate.s.slice(1, -1), candidate.count);
    }
  },

  /**
   * Depth-first walk that appends to this.dups every node whose count is
   * greater than 1 and that has no child with the same count.
   *
   * @param {*} a - node to visit; anything that is not a plain object is ignored.
   * @param {Function} h - unused here; forwarded unchanged to the recursion.
   * @param {number} depth - number of words on the path from the root to `a`.
   */
  bypass: function(a, h, depth) {
    if (!a || a.constructor !== Object) return;
    var isMaximal = true;
    for (var i in a) {
      if (i === 'parent') continue; // following the back-link would loop forever
      var b = a[i];
      if (b.count === a.count) isMaximal = false;
      this.bypass(b, h, depth + 1);
    }
    if (isMaximal && a.count > 1) {
      this.dups.push({ a: a, depth: depth });
    }
  },

  /**
   * Rebuilds the word sequence that leads to node `a`.
   *
   * @param {Object} a - trie node.
   * @returns {string} the words on the path to `a`, separated by single spaces.
   */
  sentence: function(a) {
    var s = a.word;
    var n = a.parent;
    // Stop before the root: its empty word would add a stray leading space.
    while (n && n.parent) {
      s = n.word + " " + s;
      n = n.parent;
    }
    return s;
  }
};

var text = "This is a text with some duplicates: words, sentences of different length. For example here is a duplicate word. This sentence has some duplicates. But not all of us can find clones.";
var T = new SuffixTree(text);
// Callback for found duplicates: writes into the page when there is one and
// falls back to the console otherwise (for example under Node).
var h = function(s, c) {
  var line = s + "[" + c + "]";
  if (typeof document !== 'undefined') document.write(line + "<br/>");
  else console.log(line);
};
T.duplicates(h);

1) Split the input text into an array of words. 2) Build a suffix tree. 3) Find the longest repeated suffixes in the tree. 4) Discard sentences that are contained in others (i.e., discard "is", which is part of "this is").

You can change the regular expression to take into account html tags.

Hope this helps you.

P.S. h is the callback that is invoked for each duplicate found.

+3
source

Your JavaScript references a JavaScript library called jQuery.

You do not include it in your HTML, and therefore the code will not work. You can include it via the jQuery CDN.

And today's tip: use the developer tools in your browser. In the console, you can see which parts of javascript are failing.

0
source

All Articles