Naive Bayes Classifier Error

Hey, I'm trying to use the Naive Bayes classifier to classify some text. I am using NLTK. Whenever I test the classifier using the classify() method, it returns the correct classification for the first item and then that same classification for every other line of text that I classify. Below is my code:

import random

import nltk
import nltk.data
from nltk.corpus import movie_reviews
from nltk.tokenize import word_tokenize


def bag_of_words(words):
    """Map every token in *words* to True, keyed by the bare token."""
    return {word: True for word in words}


def document_features(document):
    """Build the feature dict for one review, keyed as 'contains(<word>)'.

    Every entry of the module-level ``word_features`` gets a boolean telling
    whether that word occurs in *document*.
    """
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features


# One (token list, category) pair per review in the corpus, in random order.
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((list(movie_reviews.words(fileid)), category))
random.shuffle(documents)

# Candidate features: the first 2000 keys of the corpus-wide frequency table.
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = all_words.keys()[:2000]

featuresets = [(document_features(d), c) for (d, c) in documents]
# The first 100 shuffled reviews are held out; the rest are used for training.
test_set = featuresets[:100]
train_set = featuresets[100:]
classifier = nltk.NaiveBayesClassifier.train(train_set)

text1 = "i love this city"
text2 = "i hate this city"
feats1 = bag_of_words(word_tokenize(text1))
feats2 = bag_of_words(word_tokenize(text2))
print(classifier.classify(feats1))
print(classifier.classify(feats2))

This code prints pos twice, whereas if I swap the last 2 lines of the code, it prints neg twice. Can anyone help?

+7
source share
1 answer

Edit

 features['contains(%s)' % word] = (word in document_words) 

to

 features[word] = (word in document) 

Otherwise, the classifier only knows about "words" of the form "contains(...)" and therefore knows nothing about the bare words in "i love this city".


 import random

 import nltk
 import nltk.tokenize as tokenize

 random.seed(3)  # fixed seed so the shuffle below is repeatable


 def bag_of_words(words):
     """Return a feature dict mapping each token in *words* to True."""
     return {word: True for word in words}


 def document_features(document):
     """Return ``{word: bool}`` for every word in ``word_features``.

     Keys are the bare words -- not 'contains(word)' -- so they line up with
     the keys that bag_of_words() produces for the sentences classified below.
     """
     return {word: (word in document) for word in word_features}


 movie_reviews = nltk.corpus.movie_reviews
 # Each review is stored as a set of tokens, paired with its category.
 documents = [(set(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
              for fileid in movie_reviews.fileids(category)]
 random.shuffle(documents)

 all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
 # Pick the 2000 most frequent words explicitly instead of slicing
 # all_words.keys(): on Python 3 dict views cannot be sliced, and NLTK 3's
 # FreqDist (a Counter) does not iterate in frequency order. The sort is
 # stable, so wherever keys() was already frequency-sorted the result is the
 # same as the old slice.
 word_features = sorted(all_words, key=all_words.get, reverse=True)[:2000]

 # Train on the first 200 shuffled reviews only.
 train_set = [(document_features(d), c) for (d, c) in documents[:200]]
 classifier = nltk.NaiveBayesClassifier.train(train_set)
 classifier.show_most_informative_features()

 for word in ('love', 'hate'):
     # No hope in passing the tests if word is not in word_features
     assert word in word_features
     print('probability {w!r} is positive: {p:.2%}'.format(
         w=word, p=classifier.prob_classify({word: True}).prob('pos')))

 tests = ["i love this city", "i hate this city"]
 for test in tests:
     words = tokenize.word_tokenize(test)
     feats = bag_of_words(words)
     print('{s} => {c}'.format(s=test, c=classifier.classify(feats)))

gives

 Most Informative Features worst = True neg : pos = 15.5 : 1.0 ridiculous = True neg : pos = 11.5 : 1.0 batman = True neg : pos = 7.6 : 1.0 drive = True neg : pos = 7.6 : 1.0 blame = True neg : pos = 7.6 : 1.0 terrible = True neg : pos = 6.9 : 1.0 rarely = True pos : neg = 6.4 : 1.0 cliches = True neg : pos = 6.0 : 1.0 $ = True pos : neg = 5.9 : 1.0 perfectly = True pos : neg = 5.5 : 1.0 probability 'love' is positive: 61.52% probability 'hate' is positive: 36.71% i love this city => pos i hate this city => neg 
+4
source

All Articles