How can I measure string similarity between sentences?

I have the following task.

Given the following list of strings:

        // Sample sentences (plus a few single words) used as input for the
        // pairwise similarity comparison.
        var strings = [
            "Steve jobs created the iPod when he was at Apple",
            "I really like the new Macbook by Apple",
            "Jony Ive was concerned being fired by Steve Jobs after his return to Apple",
            "The new Macbook has just one USB-C type connector",
            "I like bananas",
            "The brezels I can buy in my local store are much better than the ones in the supermarket",
            "the",
            "foo",
            "Steve"
        ];

Now I want to compare each string with every other string, and for each comparison I want to find out how similar the two are on a scale of 0-1 (or 0%-100%).

So I searched Google a little and found this: Comparing the similarity of strings in Java

So, I followed the instructions there and ported the method similarity(String s1, String s2) to JavaScript:

        /**
         * Computes a normalised similarity score for two strings.
         *
         * The score is (L - d) / L, where L is the length of the longer input
         * and d is the edit distance reported by calling `LevenshteinDistance`
         * on the longer input with the shorter one as argument.
         *
         * @param {string} s1 - First string.
         * @param {string} s2 - Second string.
         * @returns {number} 1.0 when both inputs are empty, otherwise
         *     (L - d) / L as described above.
         */
        function similarity(s1, s2) {
            var firstIsShorter = s1.length < s2.length;
            var longer = firstIsShorter ? s2 : s1;
            var shorter = firstIsShorter ? s1 : s2;
            var size = longer.length;

            // Two empty inputs: report full similarity and avoid dividing by zero.
            if (size == 0) {
                return 1.0;
            }

            var distance = longer.LevenshteinDistance(shorter);
            return (size - distance) / size;
        }

As the comparison algorithm, I used the Levenshtein distance:

        /**
         * Edit distance between this string and `s2`, computed with a full
         * (this.length + 1) x (s2.length + 1) dynamic-programming table.
         * Each insertion, deletion or substitution costs 1; matching
         * characters cost nothing.
         *
         * @param {string} s2 - The string to compare against.
         * @returns {number} The number of single-character edits.
         */
        String.prototype.LevenshteinDistance = function (s2) {
            var rowCount = this.length + 1;
            var colCount = s2.length + 1;
            var table = new Array(rowCount);
            var r;
            var c;

            // First column: turning a prefix of length r into "" takes r deletions.
            for (r = 0; r < rowCount; r++) {
                table[r] = new Array(colCount);
                table[r][0] = r;
            }
            // First row: turning "" into a prefix of length c takes c insertions.
            for (c = 0; c < colCount; c++) {
                table[0][c] = c;
            }

            for (r = 1; r < rowCount; r++) {
                for (c = 1; c < colCount; c++) {
                    if (this[r - 1] == s2[c - 1]) {
                        // Same character: carry the diagonal value over unchanged.
                        table[r][c] = table[r - 1][c - 1];
                        continue;
                    }
                    // Cheapest of insert (left), delete (above), substitute (diagonal).
                    table[r][c] = 1 + Math.min(
                        table[r][c - 1],
                        table[r - 1][c],
                        table[r - 1][c - 1]
                    );
                }
            }
            return table[rowCount - 1][colCount - 1];
        };

So, as a test, I ran a full loop comparing each string with every other string and printing the result as follows:

            // Compare every string with every string in the list (itself
            // included) and print each similarity as a rounded percentage.
            // Indexed loops are used instead of `for...in`, which enumerates
            // property keys (as strings) and would also visit any enumerable
            // property added to Array.prototype.
            // NOTE(review): `print` is not defined in this snippet; presumably
            // it is an output helper provided by the page — confirm.
            for (var i = 0; i < strings.length; i++) {
                var s = strings[i];
                print('Checking string: "' + s + '"');
                for (var j = 0; j < strings.length; j++) {
                    print('-----');
                    var s2 = strings[j];
                    print('vs "' + s2 + '"');
                    var sim = similarity(s, s2);
                    print('Similarity: ' + Math.round(sim*100) + '%');
                }
                print('<br>////// NEXT /////////////////////////////////////////////////<br>');
            }

Ok, now this is the result: https://jsfiddle.net/wxksfa4w/

, , , , , :

" iPod, Apple" " " 13%?

" iPod, Apple", "" 10%, "" ?

? ? , , , , 1 2. , , -, , .

?

+4
3

, . - tf-idf

+1

SimMetrics Java Smith Waterman Gotoh, . Smith Waterman Gotoh , .

0

All Articles