How to fix my Naive Bayes method returning extremely small conditional probabilities?

Question

How to fix my Naive Bayes method returning extremely small conditional probabilities?

I am trying to calculate the likelihood that the email will be spam from Naive Bayes. I have a document class for creating documents (filed from a website) and another class for teaching and classifying documents. My train function calculates all the unique conditions in all documents, all documents in the spam class, all documents in a class other than spam, calculates the previous probabilities (one for spam, the other for ham). Then I use the following formula to store conditional probabilities for each member in a dict

Tct = number of member occurrences in a given class
Tct 'is # a term in terms in a given class
B' = # unique terms in all documents

classes = spam or ham
spam = spam, ham = not spam

, , , , 2.461114392596968e-05. , , Tct (, 5 8) Tct '( 64878 308930 ) B' ( 16386). , condprob .00034155, , condprob , . - ? ?
, - , 327.82, 758.80 138.66

, condprob, .

-

class Document(object):
"""
The instance variables are:
filename....The path of the file for this document.
label.......The true class label ('spam' or 'ham'), determined by whether the filename contains the string 'spmsg'
tokens......A list of token strings.
"""

def __init__(self, filename=None, label=None, tokens=None):
    """ Initialize a document either from a file, in which case the label
    comes from the file name, or from specified label and tokens, but not
    both.
    """
    if label: # specify from label/tokens, for testing.
        self.label = label
        self.tokens = tokens
    else: # specify from file.
        self.filename = filename
        self.label = 'spam' if 'spmsg' in filename else 'ham'
        self.tokenize()

def tokenize(self):
    self.tokens = ' '.join(open(self.filename).readlines()).split()

-NaiveBayes

class NaiveBayes(object):
def train(self, documents):
    """
    Given a list of labeled Document objects, compute the class priors and
    word conditional probabilities, following Figure 13.2 of your
    book. Store these as instance variables, to be used by the classify
    method subsequently.
    Params:
      documents...A list of training Documents.
    Returns:
      Nothing.
    """
    ###TODO
    unique = []
    proxy = []
    proxy2 = []
    proxy3 = []
    condprob = [{},{}]
    Tct = defaultdict()
    Tc_t = defaultdict()
    prior = {}
    count = 0
    oldterms = []
    old_terms = []
    for a in range(len(documents)):
        done = False
        for item in documents[a].tokens:
            if item not in unique:
                unique.append(item)
            if documents[a].label == "ham":
                proxy2.append(item)
                if done == False:
                    count += 1
            elif documents[a].label == "spam":
                proxy3.append(item)
            done = True
    V = unique
    N = len(documents)
    print("N:",N)
    LB = len(unique)
    print("THIS IS LB:",LB)
    self.V = V
    print("THIS IS COUNT/NC", count)
    Nc = count
    prior["ham"] = Nc / N
    self.prior = prior
    Nc = len(documents) - count
    print("THIS IS SPAM COUNT/NC", Nc)
    prior["spam"] = Nc / N
    self.prior = prior
    text2 = proxy2
    text3 = proxy3
    TctTotal = len(text2)
    Tc_tTotal = len(text3)
    print("THIS IS TCTOTAL",TctTotal)
    print("THIS IS TC_TTOTAL",Tc_tTotal)
    for term in text2:
        if term not in oldterms:
            Tct[term] = text2.count(term)
            oldterms.append(term)
    for term in text3:
        if term not in old_terms:
            Tc_t[term] = text3.count(term)
            old_terms.append(term)
    for term in V:
        if term in text2:
            condprob[0].update({term: (Tct[term] + 1) / (TctTotal + LB)})
        if term in text3:
            condprob[1].update({term: (Tc_t[term] + 1) / (Tc_tTotal + LB)})
    print("This is condprob", condprob)
    self.condprob = condprob

def classify(self, documents):
    """ Return a list of strings, either 'spam' or 'ham', for each document.
    Params:
      documents....A list of Document objects to be classified.
    Returns:
      A list of label strings corresponding to the predictions for each document.
    """
    ###TODO
    #return list["string1", "string2", "stringn"]
    # docs2 = ham, condprob[0] is ham
    # docs3 = spam, condprob[1] is spam
    unique = []
    ans = []
    hscore = 0
    sscore = 0
    for a in range(len(documents)):
        for item in documents[a].tokens:
            if item not in unique:
                unique.append(item)
        W = unique
        hscore = math.log(float(self.prior['ham']))
        sscore = math.log(float(self.prior['spam']))
        for t in W:
            try:
                hscore += math.log(self.condprob[0][t])
            except KeyError:
                continue
            try:
                sscore += math.log(self.condprob[1][t])
            except KeyError:
                continue
        print("THIS IS SSCORE",sscore)
        print("THIS IS HSCORE",hscore)
        unique = []
        if hscore > sscore:
            str = "Spam"
        elif sscore > hscore:
            str = "Ham"
        ans.append(str)

    return ans

-

if not os.path.exists('train'):  # download data
from urllib.request import urlretrieve
import tarfile

urlretrieve('http://cs.iit.edu/~culotta/cs429/lingspam.tgz', 'lingspam.tgz')
tar = tarfile.open('lingspam.tgz')
tar.extractall()
tar.close()
train_docs = [Document(filename=f) for f in glob.glob("train/*.txt")]
test_docs = [Document(filename=f) for f in glob.glob("test/*.txt")]
test = train_docs

nb = NaiveBayes()
nb.train(train_docs[1500:])
#uncomment when testing classify()
#predictions = nb.classify(test_docs[:200])
#print("PREDICTIONS",predictions)

, , .

, ? , ? , , condprob?

condprob, , :

"": 2.461114392596968e-05, 'fillmore': 2.461114392596968e-05, '796': 2.461114392596968e-05, 'zann': 2.461114392596968e-05

condprob - , , - , - . . "" , .00031235, 3.1235e-05. , condprob , , ,

-2634.5292392650663, SSCORE -1707.983339196181

327.82, 758.80

~ 1 , 30

+4

python algorithm computer-science data-retrieval

Codarus 07 '16 21:19

1

mcdowella · Answer 1 · 2016-05-08T05:24:35+0000

(, , , , , , , , , , , )

, p (Ham) p (Spam), , , p (Ham) * p (Observed data | Ham) p () * p ( | ).

, lA lB. , , , ,

lA = lA - max (lA, lB)

lB = lB - max (lA, lB)

. , , . :

pA = exp (lA)

pB = exp (lB)

,

truePA = pA/(pA + pB)

truePB = pB/(pA + pB)

How to fix my Naive Bayes method returning extremely small conditional probabilities?

More articles: