In [1]:
import re
class Tokenizer(object):
    """Lower-casing whitespace tokenizer with stop-word and punctuation helpers.

    Parameters
    ----------
    stop_words : list of str, optional
        Tokens that ``remove_stop_words`` replaces with the placeholder
        ``"stop_word"``. Defaults to an empty list.
    signs_to_remove : list of str, optional
        Strings whose individual characters ``remove_punctuation`` deletes
        from a token. Defaults to ``["?!#%&"]``.
    """

    def __init__(self, stop_words=None, signs_to_remove=None):
        # None sentinels instead of list literals in the signature: a default
        # list would be one shared object mutated across every instance.
        self.stop_words = [] if stop_words is None else stop_words
        self.signs_to_remove = (
            ["?!#%&"] if signs_to_remove is None else signs_to_remove
        )

    def tokenize(self, text):
        """Return the lower-cased, whitespace-separated tokens of ``text``.

        ``split()`` without an argument is used so that repeated, leading or
        trailing whitespace never produces empty-string tokens; an empty
        ``text`` yields an empty list.
        """
        return text.lower().split()

    def remove_stop_words(self, token):
        """Return ``"stop_word"`` if ``token`` is a stop word, else ``token``."""
        if token in self.stop_words:
            return "stop_word"
        return token

    def remove_punctuation(self, token):
        """Return ``token`` with every character listed in ``signs_to_remove`` deleted."""
        signs = "".join(self.signs_to_remove)
        if not signs:
            # Nothing to strip; an empty character class is not a valid regex.
            return token
        # Build the character class explicitly. Formatting the list with str()
        # would pull the repr's quotes, commas and spaces into the pattern, and
        # unescaped signs such as '-' or ']' would be read as regex syntax.
        return re.sub("[" + re.escape(signs) + "]", "", token)
In [2]:
"""
Suppose you have some texts of news and know their categories.
You want to train a system with this pre-categorized/pre-classified
texts. So, you have better call this data your training set.
"""
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier
# Build the tokenizer once and share it between the trainer and the classifier,
# so both steps always split and clean text with the same settings.
newsTokenizer = Tokenizer(stop_words = [], signs_to_remove = ["?#%&"])
newsTrainer = Trainer(newsTokenizer)

# You need to train the system by passing each text, one by one, to the trainer.
newsSet = [
    {"text": "free online !!! results free", "category": "spam"},
    {"text": "results repository online !!!", "category": "ham"},
    {"text": "!!! online paper free", "category": "spam"},
    {"text": "!!! conference registration online !!!", "category": "spam"},
    {"text": "free call free registration online", "category": "spam"},
    {"text": "conference call paper registration conference", "category": "ham"},
    {"text": "submission deadline conference call deadline", "category": "ham"},
]
for news in newsSet:
    newsTrainer.train(news["text"], news["category"])

# When you have enough training data, you are almost done and can start to use
# a classifier.
newsClassifier = Classifier(newsTrainer.data, newsTokenizer)

# Now you have a classifier that can try to classify news texts whose
# category is not yet known.
unknownInstances = [
    "free submission online !!!",
    "conference paper submission deadline",
]
for unknownInstance in unknownInstances:
    classification = newsClassifier.classify(unknownInstance)
    # the classification variable holds the possible categories sorted by
    # their probability value
    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(unknownInstance)
    print(classification)
In [13]:
# Count how often each word occurs in the texts of each category.
# Result layout: {category: {word: number of occurrences}}.
type_count = {}
for entry in newsSet:
    text = entry['text']
    category = entry['category']
    # split() without an argument already yields whitespace-free tokens and
    # skips the empty strings that split(' ') produces for repeated spaces,
    # so no separate per-token clean-up pass is needed.
    words = text.lower().split()
    # Fetch this category's running counts, creating them on first sight.
    words_count = type_count.setdefault(category, {})
    for word in words:
        words_count[word] = words_count.get(word, 0) + 1

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print(type_count)
In [ ]: