I am trying to calculate the likelihood that the email will be spam from Naive Bayes. I have a document class for creating documents (filed from a website) and another class for teaching and classifying documents. My train function calculates all the unique conditions in all documents, all documents in the spam class, all documents in a class other than spam, calculates the previous probabilities (one for spam, the other for ham). Then I use the following formula to store conditional probabilities for each member in a dict

Tct = number of member occurrences in a given class
Tct 'is # a term in terms in a given class
B' = # unique terms in all documents
classes = spam or ham
spam = spam, ham = not spam
, , , , 2.461114392596968e-05. , , Tct (, 5 8) Tct '( 64878 308930 ) B' ( 16386). , condprob .00034155, , condprob , . - ? ?
, - , 327.82, 758.80 138.66

, condprob, .
-
class Document(object):
"""
The instance variables are:
filename....The path of the file for this document.
label.......The true class label ('spam' or 'ham'), determined by whether the filename contains the string 'spmsg'
tokens......A list of token strings.
"""
def __init__(self, filename=None, label=None, tokens=None):
""" Initialize a document either from a file, in which case the label
comes from the file name, or from specified label and tokens, but not
both.
"""
if label:
self.label = label
self.tokens = tokens
else:
self.filename = filename
self.label = 'spam' if 'spmsg' in filename else 'ham'
self.tokenize()
def tokenize(self):
self.tokens = ' '.join(open(self.filename).readlines()).split()
-NaiveBayes
class NaiveBayes(object):
def train(self, documents):
"""
Given a list of labeled Document objects, compute the class priors and
word conditional probabilities, following Figure 13.2 of your
book. Store these as instance variables, to be used by the classify
method subsequently.
Params:
documents...A list of training Documents.
Returns:
Nothing.
"""
unique = []
proxy = []
proxy2 = []
proxy3 = []
condprob = [{},{}]
Tct = defaultdict()
Tc_t = defaultdict()
prior = {}
count = 0
oldterms = []
old_terms = []
for a in range(len(documents)):
done = False
for item in documents[a].tokens:
if item not in unique:
unique.append(item)
if documents[a].label == "ham":
proxy2.append(item)
if done == False:
count += 1
elif documents[a].label == "spam":
proxy3.append(item)
done = True
V = unique
N = len(documents)
print("N:",N)
LB = len(unique)
print("THIS IS LB:",LB)
self.V = V
print("THIS IS COUNT/NC", count)
Nc = count
prior["ham"] = Nc / N
self.prior = prior
Nc = len(documents) - count
print("THIS IS SPAM COUNT/NC", Nc)
prior["spam"] = Nc / N
self.prior = prior
text2 = proxy2
text3 = proxy3
TctTotal = len(text2)
Tc_tTotal = len(text3)
print("THIS IS TCTOTAL",TctTotal)
print("THIS IS TC_TTOTAL",Tc_tTotal)
for term in text2:
if term not in oldterms:
Tct[term] = text2.count(term)
oldterms.append(term)
for term in text3:
if term not in old_terms:
Tc_t[term] = text3.count(term)
old_terms.append(term)
for term in V:
if term in text2:
condprob[0].update({term: (Tct[term] + 1) / (TctTotal + LB)})
if term in text3:
condprob[1].update({term: (Tc_t[term] + 1) / (Tc_tTotal + LB)})
print("This is condprob", condprob)
self.condprob = condprob
def classify(self, documents):
""" Return a list of strings, either 'spam' or 'ham', for each document.
Params:
documents....A list of Document objects to be classified.
Returns:
A list of label strings corresponding to the predictions for each document.
"""
unique = []
ans = []
hscore = 0
sscore = 0
for a in range(len(documents)):
for item in documents[a].tokens:
if item not in unique:
unique.append(item)
W = unique
hscore = math.log(float(self.prior['ham']))
sscore = math.log(float(self.prior['spam']))
for t in W:
try:
hscore += math.log(self.condprob[0][t])
except KeyError:
continue
try:
sscore += math.log(self.condprob[1][t])
except KeyError:
continue
print("THIS IS SSCORE",sscore)
print("THIS IS HSCORE",hscore)
unique = []
if hscore > sscore:
str = "Spam"
elif sscore > hscore:
str = "Ham"
ans.append(str)
return ans
-
if not os.path.exists('train'):
from urllib.request import urlretrieve
import tarfile
urlretrieve('http://cs.iit.edu/~culotta/cs429/lingspam.tgz', 'lingspam.tgz')
tar = tarfile.open('lingspam.tgz')
tar.extractall()
tar.close()
train_docs = [Document(filename=f) for f in glob.glob("train/*.txt")]
test_docs = [Document(filename=f) for f in glob.glob("test/*.txt")]
test = train_docs
nb = NaiveBayes()
nb.train(train_docs[1500:])
, , .
, ? , ? , , condprob?
condprob, , :
"": 2.461114392596968e-05, 'fillmore': 2.461114392596968e-05, '796': 2.461114392596968e-05, 'zann': 2.461114392596968e-05
condprob - , , - , - . . "" , .00031235, 3.1235e-05.
, condprob , , ,
-2634.5292392650663, SSCORE -1707.983339196181
327.82, 758.80
~ 1 , 30