Cosine Similarity Using TF-IDF

There are several questions on SO and around the Internet describing how to compute the cosine similarity between two strings, and even between two strings with TF-IDF weights. But the output of a function like scikit-learn's linear_kernel confuses me a bit.

Consider the following code:

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

a = ['hello world', 'my name is', 'what is your name?']
b = ['my name is', 'hello world', 'my name is what?']

df = pd.DataFrame(data={'a':a, 'b':b})
df['ab'] = df.apply(lambda x : x['a'] + ' ' + x['b'], axis=1)
print(df.head())

                    a                 b                                   ab
0         hello world        my name is               hello world my name is
1          my name is       hello world               my name is hello world
2  what is your name?  my name is what?  what is your name? my name is what?

Question: I would like to have a column containing the cosine similarity between each row in a and the corresponding row in b.

What I tried:

I fitted the TF-IDF vectorizer on ab so that the vocabulary includes all the words:

clf = TfidfVectorizer(ngram_range=(1, 1), stop_words='english')
clf.fit(df['ab'])
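
As a quick sanity check (an editor's sketch, not part of the original question), you can inspect which words survive the stop-word filter; with stop_words='english', these short strings boil down to very few terms:

# Inspect the learned vocabulary; with stop_words='english', words such as
# 'my', 'is', 'what' and 'your' are dropped as English stop words.
print(clf.get_feature_names())   # get_feature_names_out() in newer scikit-learn
# roughly ['hello', 'name', 'world']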

Then I transformed columns a and b into sparse TF-IDF matrices:

tfidf_a = clf.transform(df['a'])
tfidf_b = clf.transform(df['b'])

Now, if I use scikit-learn's linear_kernel as recommended in other answers, it returns the Gram matrix of shape (n_samples_X, n_samples_Y), as described in the docs:

from sklearn.metrics.pairwise import linear_kernel
linear_kernel(tfidf_a,tfidf_b)

array([[ 0.,  1.,  0.],
       [ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])

But that is not what I want. What I'm after is a single column of element-wise similarities between a and b, i.e. cos_sim(a[0], b[0]), cos_sim(a[1], b[1]), and so on.

I'm using Python 3 and scikit-learn 0.17.
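
For what it's worth (an editor's note, not part of the original question): because TfidfVectorizer L2-normalizes each row by default, those element-wise similarities are simply the diagonal of the linear_kernel matrix above. A minimal sketch, with the column name cos_sim chosen only for illustration:

import numpy as np

# rows of tfidf_a/tfidf_b have unit L2 norm, so the dot products returned by
# linear_kernel are already cosine similarities; the diagonal pairs row i of a
# with row i of b
df['cos_sim'] = np.diag(linear_kernel(tfidf_a, tfidf_b))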


The main difference from your code is that I build the TfidfVectorizer without stop_words='english' (with it, most of the words in these short strings are thrown away), and then compute the cosine similarity row by row with scipy. Is this what you are looking for?

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy import spatial

a = ['hello world', 'my name is', 'what is your name?']
b = ['my name is', 'hello world', 'my name is what?']

df = pd.DataFrame(data={'a':a, 'b':b})
df['ab'] = df.apply(lambda x : x['a'] + ' ' + x['b'], axis=1)

clf = TfidfVectorizer(ngram_range=(1, 1))
clf.fit(df['ab'])

# use dense ndarrays so that each row is a 1-D vector accepted by scipy's cosine()
tfidf_a = clf.transform(df['a']).toarray()
tfidf_b = clf.transform(df['b']).toarray()

# cosine similarity = 1 - cosine distance, computed row by row
row_similarities = [1 - spatial.distance.cosine(tfidf_a[x], tfidf_b[x]) for x in range(len(tfidf_a))]
row_similarities

[0.0, 0.0, 0.72252389079716417]

These are the per-row similarities you asked for; note that the values differ from yours because the stop-word filtering is gone, so add stop_words='english' back if you need it.
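
A vectorized variant of the same idea (a sketch, not from the original answer; sparse_a, sparse_b and row_sims are illustrative names): since the TF-IDF rows are already L2-normalized, you can avoid densifying and multiply the two sparse matrices element-wise:

import numpy as np

# element-wise product of corresponding rows, summed over the vocabulary,
# gives the dot product of row i of a with row i of b; because the rows are
# unit-length, this is exactly the cosine similarity
sparse_a = clf.transform(df['a'])
sparse_b = clf.transform(df['b'])
row_sims = np.asarray(sparse_a.multiply(sparse_b).sum(axis=1)).ravel()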

Another answer takes a from-scratch approach, building TF-IDF vectors as plain Python dictionaries and computing the cosine similarity manually:
from math import log10, sqrt
import operator

# NOTE: mytokenizer, stemmer, sortedstopwords, getidf, getqvec, gettfidfvec
# and the loading of speeches/speechvecs are assumed to be defined elsewhere
# in the original program (e.g. an NLTK tokenizer, stemmer and stop-word list).

dfs = {}                  # document frequency of each term
idfs = {}                 # inverse document frequency of each term
speeches = {}             # raw documents, keyed by filename
speechvecs = {}           # normalized TF-IDF vectors, keyed by filename
total_word_counts = {}    # total count of each term across the corpus

def tokenize(doc):
    # lowercase, stop-word-filter and stem a document into a list of tokens
    tokens = mytokenizer.tokenize(doc)
    lowertokens = [token.lower() for token in tokens]
    filteredtokens = [stemmer.stem(token) for token in lowertokens if token not in sortedstopwords]
    return filteredtokens

def incdfs(tfvec):
    # update document frequencies and corpus-wide term counts with one
    # document's term-frequency dict
    for token in set(tfvec):
        if token not in dfs:
            dfs[token] = 1
            total_word_counts[token] = tfvec[token]
        else:
            dfs[token] += 1
            total_word_counts[token] += tfvec[token]


def calctfidfvec(tfvec, withidf):
    # turn a term-frequency dict into an L2-normalized, log-scaled TF (or TF-IDF) vector
    tfidfvec = {}
    veclen = 0.0

    for token in tfvec:
        if withidf:
            tfidf = (1 + log10(tfvec[token])) * getidf(token)
        else:
            tfidf = (1 + log10(tfvec[token]))
        tfidfvec[token] = tfidf
        veclen += pow(tfidf, 2)

    # normalize to unit length so that a plain dot product equals cosine similarity
    if veclen > 0:
        for token in tfvec:
            tfidfvec[token] /= sqrt(veclen)

    return tfidfvec

def cosinesim(vec1, vec2):
    # dot product over the shared terms; with unit-length vectors this is the cosine similarity
    commonterms = set(vec1).intersection(vec2)
    sim = 0.0
    for token in commonterms:
        sim += vec1[token] * vec2[token]

    return sim

def query(qstring):
    # return the filename of the stored document most similar to the query string
    qvec = getqvec(qstring.lower())
    scores = {filename: cosinesim(qvec, tfidfvec) for filename, tfidfvec in speechvecs.items()}
    return max(scores.items(), key=operator.itemgetter(1))[0]

def docdocsim(filename1, filename2):
    # cosine similarity between two stored documents
    return cosinesim(gettfidfvec(filename1), gettfidfvec(filename2))
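
To make the relationship between these helpers concrete, here is a minimal usage sketch with hand-written term-frequency dicts (the dicts and the withidf=False setting are illustrative only, not from the original answer):

# two tiny term-frequency dicts, normalized with plain log-TF (no IDF)
v1 = calctfidfvec({'name': 1, 'hello': 1}, False)
v2 = calctfidfvec({'name': 2, 'world': 1}, False)
print(cosinesim(v1, v2))   # similarity driven by the shared term 'name'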