In [12]:
import re

class Tokenizer(object):
    def __init__(self, stop_words=[], signs_to_remove=["?!#%&"]):
        self.stop_words = stop_words
        self.signs_to_remove = signs_to_remove

    def tokenize(self, text):
        # Lower-case the text and split it on single spaces (so "!!!" stays a token).
        return text.lower().split(' ')

    def remove_stop_words(self, token):
        # Map every stop word onto a single placeholder token.
        if token in self.stop_words:
            return "stop_word"
        else:
            return token

    def remove_punctuation(self, token):
        # Strip every character listed in signs_to_remove from the token.
        return re.sub("[%s]" % re.escape("".join(self.signs_to_remove)), "", token)
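
In [ ]:
"""
A quick, illustrative check of the Tokenizer above (the sample sentence is
made up): tokenize a text, then apply the punctuation and stop-word filters
to each token.
"""
t = Tokenizer(stop_words=["the"], signs_to_remove=["?#%&"])

tokens = t.tokenize("The free registration & online results?")
print tokens                                          # ['the', 'free', 'registration', '&', 'online', 'results?']
print [t.remove_punctuation(tok) for tok in tokens]   # ['the', 'free', 'registration', '', 'online', 'results']
print [t.remove_stop_words(tok) for tok in tokens]    # 'the' becomes 'stop_word'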

In [17]:
"""
Suppose you have some texts of news and know their categories.
You want to train a system with this pre-categorized/pre-classified 
texts. So, you have better call this data your training set.
"""

from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

newsTrainer = Trainer(Tokenizer(stop_words = [], signs_to_remove = ["?#%&"]))

# You need to train the system by passing each text to the trainer, one at a time.
newsSet=[
    {"text": "free online !!! results free", "category": "spam"},
    {"text": "results repository online !!!", "category": "ham"},
    {"text": "!!! online paper free !!!", "category": "spam"},
    {"text": "!!! conference registration online !!!", "category": "spam"},
    {"text": "free call free registration online", "category": "spam"},
    {"text": "conference call paper registration conference", "category": "ham"},
    {"text": "submission deadline conference call deadline", "category": "ham"},
]

for news in newsSet:
    newsTrainer.train(news["text"], news["category"])

# When you have sufficient training data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(newsTrainer.data, Tokenizer(stop_words = [], signs_to_remove = ["?#%&"]))

# Now you have a classifier that can try to classify news texts whose
# category is not yet known.
unknownInstances = [
    "free submission online !!!",
    "conference paper submission deadline",
    "results paper call registration submission"
]
for unknownInstance in unknownInstances:
    classification = newsClassifier.classify(unknownInstance)
    # the classification variable holds the possible categories sorted by
    # their probability value
    print unknownInstance
    print classification


free submission online !!!
[('spam', 2.734375e-09), ('ham', 8.641975308641975e-11)]
conference paper submission deadline
[('ham', 0.1728395061728395), ('spam', 1.0937500000000002e-19)]
results paper call registration submission
[('ham', 0.019204389574759943), ('spam', 1.3671875000000001e-11)]
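
In [ ]:
"""
classify() returns the candidate categories sorted by score (highest first),
so the predicted label is simply the first entry. A small sketch of how the
result can be consumed:
"""
for unknownInstance in unknownInstances:
    classification = newsClassifier.classify(unknownInstance)
    best_category, best_score = classification[0]
    print "%s -> %s (score %.3g)" % (unknownInstance, best_category, best_score)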

In [15]:
# Count how many times each word occurs in each category ("spam" / "ham").
type_count = {}
for entry in newsSet:
    category = entry['category']
    words = entry['text'].lower().split(' ')

    # Per-document word counts.
    words_count = {}
    for word in words:
        words_count[word] = words_count.get(word, 0) + 1

    # Merge the per-document counts into the running per-category counts.
    category_count = type_count.setdefault(category, {})
    for word, count in words_count.items():
        category_count[word] = category_count.get(word, 0) + count

print type_count


{'ham': {'conference': 3, '!!!': 1, 'submission': 1, 'repository': 1, 'registration': 1, 'results': 1, 'paper': 1, 'call': 2, 'online': 1, 'deadline': 2}, 'spam': {'conference': 1, '!!!': 5, 'registration': 2, 'results': 1, 'free': 5, 'paper': 1, 'call': 1, 'online': 4}}
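
In [ ]:
"""
The raw counts above become the Laplace-smoothed probabilities used in the
following cells: add 1 to every count over the full 11-word vocabulary, so
the spam total becomes 20 + 11 = 31 and the ham total 14 + 11 = 25. A sketch:
"""
vocabulary = sorted(set(word for counts in type_count.values() for word in counts))
print len(vocabulary)            # 11

smoothed = {}
for category, counts in type_count.items():
    total = sum(counts.values()) + len(vocabulary)   # 31 for spam, 25 for ham
    smoothed[category] = dict(
        (word, (counts.get(word, 0) + 1.0) / total) for word in vocabulary
    )

print smoothed['spam']['free']   # (5 + 1) / 31
print smoothed['ham']['free']    # (0 + 1) / 25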

In [16]:
# Each row of nume_list below appears to hold the Laplace-smoothed spam counts
# (raw count + 1) for the words of one five-word training document; 31.0 in
# deno is the smoothed spam word total (20 spam tokens + 11 vocabulary words).
# 2, 1, 5, 6
nume_list = [[6.0, 5.0, 6.0, 2.0, 6.0],
[6.0, 5.0, 2.0, 6.0, 6.0],
[6.0, 2.0, 3.0, 5.0, 6.0],
[6.0, 2.0, 6.0, 3.0, 5.0],
[2.0, 2.0, 2.0, 3.0, 2.0],
[1.0, 1.0, 2.0, 2.0, 1.0]]
deno = [31.0, 31.0, 31.0, 31.0, 31.0]
fr_nume = [5.0, 4.0, 3.0, 2.0, 1.0]
fr_deno = [2.0, 1.0, 1.0, 1.0, 1.0]
probability = 5.0/9.0  # apparently the Laplace-smoothed spam prior: (4 spam docs + 1) / (7 docs + 2 categories)
numerator = 1
denominator = 1
for nume in nume_list:
    numerator = 1
    denominator = 1
    for i in range(0, 5):
        numerator *= (nume[i] * fr_nume[i])
        denominator *= (deno[i] * fr_deno[i])
    print (float(numerator)/float(denominator))*(probability)

print (2.0*1.0*5.0*6.0*4.0*3.0*2.0*5.0)/(9.0*30.0*30.0*30.0*30.0)


0.00251491914657
0.00251491914657
0.00125745957329
0.00125745957329
5.5887092146e-05
4.65725767884e-06
0.000987654320988
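
In [ ]:
"""
The same ranking can be reproduced with the textbook multinomial Naive Bayes
formula: multiply the Laplace-smoothed class prior ((4 + 1) / (7 + 2) = 5/9 for
spam, 4/9 for ham) by the smoothed probability of every word in the document.
This sketch uses the smoothed dictionary computed above; the absolute values
differ from the library's internal scores, but the order of the categories is
the same.
"""
priors = {'spam': 5.0 / 9.0, 'ham': 4.0 / 9.0}

def naive_bayes_scores(text):
    scores = {}
    for category in priors:
        score = priors[category]
        for word in text.lower().split(' '):
            score *= smoothed[category][word]
        scores[category] = score
    # Sort categories by score, highest first.
    return sorted(scores.items(), key=lambda kv: kv[1], reverse=True)

for unknownInstance in unknownInstances:
    print unknownInstance
    print naive_bayes_scores(unknownInstance)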

In [10]:
"free submission online !!!"
print 

"conference paper submission deadline"

"results paper call registration submission"
print (2.0 * 2.0 * 3.0 * 2.0 * 2.0 * 24.0 * 4.0)/(9.0 * 25.0 * 25.0 * 25.0 * 25.0 * 25.0) # ham
print (2.0 * 2.0 * 2.0 * 3.0 * 1.0 * 24.0 * 5.0)/(9.0 * 30.0 * 30.0 * 30.0 * 30.0 * 30.0) # spam


5.24288e-05
1.31687242798e-05

In [23]:
import numpy as np
print "Generating 5 spam documents"
for i in range(0,5):
    print np.random.choice(
      ['free', 'online', '!!!', 'results', 'repository', 'paper', 'call', 'conference', 'registration', 'submission', 'deadline'], 
      6,
      p=[6.0/31.0, 5.0/31.0, 6.0/31.0, 2.0/31.0, 1.0/31.0, 2.0/31.0, 2.0/31.0, 2.0/31.0, 3.0/31.0, 1.0/31.0, 1.0/31.0]
    )
    
print "Generating 5 ham documents"
for i in range(0,5):
    print np.random.choice(
      ['free', 'online', '!!!', 'results', 'repository', 'paper', 'call', 'conference', 'registration', 'submission', 'deadline'], 
      6,
      p=[1.0/25.0, 2.0/25.0, 2.0/25.0, 2.0/25.0, 2.0/25.0, 2.0/25.0, 3.0/25.0, 4.0/25.0, 2.0/25.0, 2.0/25.0, 3.0/25.0]
    )


Generating 5 spam documents
['online' 'conference' 'call' 'online' '!!!' '!!!']
['free' 'free' 'call' '!!!' 'free' 'results']
['!!!' 'repository' 'free' 'paper' 'paper' 'submission']
['call' 'free' 'results' 'conference' 'online' 'free']
['!!!' '!!!' 'free' 'registration' 'online' 'free']
Generating 5 ham documents
['conference' 'online' 'paper' '!!!' '!!!' 'call']
['call' 'paper' 'submission' 'repository' 'call' 'submission']
['conference' 'results' 'call' 'repository' 'submission' 'conference']
['call' 'submission' 'deadline' 'free' 'results' 'free']
['call' 'deadline' 'results' 'conference' 'call' 'results']
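
In [ ]:
"""
A quick empirical check of the generative model above (illustrative only):
draw a large sample from the spam word distribution and compare the observed
frequencies with the specified probabilities.
"""
from collections import Counter

words = ['free', 'online', '!!!', 'results', 'repository', 'paper', 'call', 'conference', 'registration', 'submission', 'deadline']
p_spam = [6.0/31.0, 5.0/31.0, 6.0/31.0, 2.0/31.0, 1.0/31.0, 2.0/31.0, 2.0/31.0, 2.0/31.0, 3.0/31.0, 1.0/31.0, 1.0/31.0]

sample = np.random.choice(words, 10000, p=p_spam)
counts = Counter(sample)
for word, prob in zip(words, p_spam):
    print "%-12s expected %.3f observed %.3f" % (word, prob, counts[word] / 10000.0)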
