In [1]:
import re

class Tokenizer(object):
    """Minimal space-based tokenizer with stop-word and punctuation helpers.

    Args:
        stop_words: collection of tokens that ``remove_stop_words`` replaces
            with the placeholder ``"stop_word"``. Defaults to an empty list.
        signs_to_remove: characters that ``remove_punctuation`` deletes from
            a token, given either as one string or as an iterable of
            strings. Defaults to ``["?!#%&"]``.
    """

    def __init__(self, stop_words=None, signs_to_remove=None):
        # Build fresh containers per instance: a mutable default in the
        # signature would be shared by every Tokenizer created without
        # arguments.
        self.stop_words = [] if stop_words is None else stop_words
        self.signs_to_remove = ["?!#%&"] if signs_to_remove is None else signs_to_remove

    def tokenize(self, text):
        """Lower-case ``text`` and split it on single space characters.

        Consecutive spaces therefore produce empty-string tokens.
        """
        return text.lower().split(' ')

    def remove_stop_words(self, token):
        """Return ``"stop_word"`` if ``token`` is a stop word, else ``token``."""
        if token in self.stop_words:
            return "stop_word"
        return token

    def remove_punctuation(self, token):
        """Return ``token`` with every character of ``signs_to_remove`` deleted."""
        # Joining works for both a single string and an iterable of strings.
        signs = "".join(self.signs_to_remove)
        if not signs:
            # An empty character class is not a valid regex; nothing to strip.
            return token
        # Escape the characters so that ']', '^', '-' and '\\' are matched
        # literally, and so that only the configured characters are removed
        # (not the quotes, commas or spaces of a list's repr).
        return re.sub("[" + re.escape(signs) + "]", "", token)

In [2]:
"""
Suppose you have some news texts and already know their categories.
You want to train a system with these pre-categorized (pre-classified)
texts, so this data is your training set.
"""

from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier

# The trainer and the classifier are given the same tokenizer configuration,
# so it is built once and shared instead of being spelled out twice.
newsTokenizer = Tokenizer(stop_words = [], signs_to_remove = ["?#%&"])
newsTrainer = Trainer(newsTokenizer)

# The trainer is fed one labelled text at a time.
newsSet=[
    {"text": "free online !!! results free", "category": "spam"},
    {"text": "results repository online !!!", "category": "ham"},
    {"text": "!!! online paper free", "category": "spam"},
    {"text": "!!! conference registration online !!!", "category": "spam"},
    {"text": "free call free registration online", "category": "spam"},
    {"text": "conference call paper registration conference", "category": "ham"},
    {"text": "submission deadline conference call deadline", "category": "ham"},
]

for news in newsSet:
    newsTrainer.train(news["text"], news["category"])

# Once enough labelled data has been fed to the trainer, a classifier can be
# built from the trainer's data.
newsClassifier = Classifier(newsTrainer.data, newsTokenizer)

# Use the classifier on news texts whose category is not yet known.
unknownInstances = [
    "free submission online !!!",
    "conference paper submission deadline"
]
for unknownInstance in unknownInstances:
    classification = newsClassifier.classify(unknownInstance)
    # `classification` holds the possible categories sorted by their
    # probability value.
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(unknownInstance)
    print(classification)


free submission online !!!
[('spam', 2.1875e-09), ('ham', 8.641975308641975e-11)]
conference paper submission deadline
[('ham', 0.1728395061728395), ('spam', 1.0937500000000002e-19)]

In [13]:
def count_words_by_category(entries):
    """Count word occurrences per category.

    Args:
        entries: iterable of dicts, each with a "text" and a "category" key.

    Returns:
        A dict mapping each category to a ``{word: occurrence count}`` dict.
        Words are obtained by lower-casing the text and splitting it on
        single space characters.
    """
    counts_by_category = {}
    for entry in entries:
        # One running tally per category, created on first sight.
        category_counts = counts_by_category.setdefault(entry['category'], {})
        # Tokens produced by split(' ') can never contain a space, so no
        # further space stripping is needed.
        for word in entry['text'].lower().split(' '):
            category_counts[word] = category_counts.get(word, 0) + 1
    return counts_by_category


type_count = count_words_by_category(newsSet)
# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(type_count)


{'ham': {'conference': 3, '!!!': 1, 'submission': 1, 'repository': 1, 'registration': 1, 'results': 1, 'paper': 1, 'call': 2, 'online': 1, 'deadline': 2}, 'spam': {'conference': 1, '!!!': 4, 'registration': 2, 'results': 1, 'free': 5, 'paper': 1, 'call': 1, 'online': 4}}

In [ ]: