In [12]:
import re
class Tokenizer(object):
    """Small text tokenizer with stop-word and punctuation helpers.

    Instances are handed to the ``Trainer`` and ``Classifier`` objects used
    in this notebook.

    Args:
        stop_words: collection of tokens treated as stop words. Defaults to
            an empty list.
        signs_to_remove: iterable of strings whose individual characters are
            stripped by ``remove_punctuation`` (a plain string works too).
            Defaults to ``["?!#%&"]``.
    """

    def __init__(self, stop_words=None, signs_to_remove=None):
        # None stands in for the former list defaults so that every instance
        # gets its own fresh list instead of sharing one mutable default.
        self.stop_words = [] if stop_words is None else stop_words
        self.signs_to_remove = (
            ["?!#%&"] if signs_to_remove is None else signs_to_remove
        )

    def tokenize(self, text):
        """Lower-case ``text`` and split it on single space characters.

        Consecutive spaces therefore produce empty-string tokens.
        """
        return text.lower().split(' ')

    def remove_stop_words(self, token):
        """Return ``"stop_word"`` if ``token`` is a stop word, else ``token``.

        The token is replaced by a placeholder string rather than dropped.
        """
        return "stop_word" if token in self.stop_words else token

    def remove_punctuation(self, token):
        """Return ``token`` with every configured sign character removed."""
        signs = "".join(self.signs_to_remove)
        if not signs:
            # An empty character class is not a valid regex; nothing to strip.
            return token
        # Build an explicit, escaped character class. Using str(list) as the
        # pattern would also match the quote, comma and space characters of
        # the list's repr and would misread "]", "^" or "-" in the signs.
        return re.sub("[" + re.escape(signs) + "]", "", token)
In [17]:
"""
Suppose you have some news texts and know their categories.
You want to train a system with these pre-categorized/pre-classified
texts. So, you had better call this data your training set.
"""
from naiveBayesClassifier.trainer import Trainer
from naiveBayesClassifier.classifier import Classifier
# Sign set handed to every Tokenizer in this cell. "!" is not part of it, so
# Tokenizer.remove_punctuation leaves the "!!!" tokens in the texts intact.
SIGNS_TO_REMOVE = ["?#%&"]

newsTrainer = Trainer(Tokenizer(stop_words=[], signs_to_remove=SIGNS_TO_REMOVE))

# Labelled training texts: each entry pairs a text with its known category.
newsSet = [
    {"text": "free online !!! results free", "category": "spam"},
    {"text": "results repository online !!!", "category": "ham"},
    {"text": "!!! online paper free !!!", "category": "spam"},
    {"text": "!!! conference registration online !!!", "category": "spam"},
    {"text": "free call free registration online", "category": "spam"},
    {"text": "conference call paper registration conference", "category": "ham"},
    {"text": "submission deadline conference call deadline", "category": "ham"},
]

# The trainer is fed one (text, category) pair at a time.
for news in newsSet:
    newsTrainer.train(news["text"], news["category"])

# Once enough data has been trained, build the classifier from the trainer's
# data. It is built once and reused for every unknown text below; rebuilding
# it inside the loop with the same arguments is redundant.
# NOTE(review): assumes Classifier.classify() does not mutate the classifier
# or the training data - confirm against the naiveBayesClassifier package.
newsClassifier = Classifier(
    newsTrainer.data,
    Tokenizer(stop_words=[], signs_to_remove=SIGNS_TO_REMOVE),
)

# Texts whose category is not known yet.
unknownInstances = [
    "free submission online !!!",
    "conference paper submission deadline",
    "results paper call registration submission",
]

for unknownInstance in unknownInstances:
    # The classification variable holds the possible categories sorted by
    # their probability value.
    classification = newsClassifier.classify(unknownInstance)
    print(unknownInstance)
    print(classification)
In [15]:
# Word frequencies per category: {category: {word: occurrences}}, built from
# the lower-cased, single-space-split text of every entry in newsSet.
type_count = {}
for entry in newsSet:
    # Fetch this category's tally, creating it the first time it is seen.
    category_counts = type_count.setdefault(entry['category'], {})
    for word in entry['text'].lower().split(' '):
        category_counts[word] = category_counts.get(word, 0) + 1
print(type_count)
In [16]:
'''
[6.0, 5.0, 6.0, 2.0, 6.0],
[6.0, 5.0, 2.0, 6.0, 6.0],
[6.0, 2.0, 3.0, 5.0, 6.0],
[6.0, 2.0, 6.0, 3.0, 5.0],
[2.0, 2.0, 2.0, 3.0, 2.0],
[1.0, 1.0, 2.0, 2.0, 1.0]
'''
# 2, 1, 5, 6

# Hand-entered per-word numerators, one row of five values per document.
# NOTE(review): the rows look like add-one-smoothed spam word counts of the
# five-word training texts - confirm.
nume_list = [
    [6.0, 5.0, 6.0, 2.0, 6.0],
    [6.0, 5.0, 2.0, 6.0, 6.0],
    [6.0, 2.0, 3.0, 5.0, 6.0],
    [6.0, 2.0, 6.0, 3.0, 5.0],
    [2.0, 2.0, 2.0, 3.0, 2.0],
    [1.0, 1.0, 2.0, 2.0, 1.0],
]
deno = [31.0] * 5                      # per-word denominator, same for every position
fr_nume = [5.0, 4.0, 3.0, 2.0, 1.0]    # extra numerator factors (5 * 4 * 3 * 2 * 1)
fr_deno = [2.0, 1.0, 1.0, 1.0, 1.0]    # extra denominator factors (2 * 1 * 1 * 1 * 1)
probability = 5.0 / 9.0                # factor applied to every row's ratio

# The denominator product does not depend on the row, so it is folded once,
# in the same left-to-right order as before.
denominator = 1
for shared, factor in zip(deno, fr_deno):
    denominator *= (shared * factor)

# One value is printed per row: (numerator product / denominator product) * probability.
for nume in nume_list:
    numerator = 1
    for count, factor in zip(nume, fr_nume):
        numerator *= (count * factor)
    print((numerator / denominator) * probability)

# Stand-alone hand computation for a four-factor case.
# NOTE(review): this uses 30.0 as the per-word denominator while `deno` above
# uses 31.0 - confirm which value is intended.
print((2.0*1.0*5.0*6.0*4.0*3.0*2.0*5.0)/(9.0*30.0*30.0*30.0*30.0))
In [10]:
"free submission online !!!"
print("")
"conference paper submission deadline"
"results paper call registration submission"
# Hand-computed scores for the two classes; each is a product of numerators
# divided by a product of denominators, written out factor by factor.
ham_numerator = 2.0 * 2.0 * 3.0 * 2.0 * 2.0 * 24.0 * 4.0
ham_denominator = 9.0 * 25.0 * 25.0 * 25.0 * 25.0 * 25.0
# NOTE(review): the spam denominator uses 30.0 per word, whereas the spam word
# probabilities elsewhere in this notebook are fractions of 31.0 - confirm.
spam_numerator = 2.0 * 2.0 * 2.0 * 3.0 * 1.0 * 24.0 * 5.0
spam_denominator = 9.0 * 30.0 * 30.0 * 30.0 * 30.0 * 30.0
print(ham_numerator / ham_denominator)  # ham
print(spam_numerator / spam_denominator)  # spam
In [23]:
import numpy as np
# Words that can be drawn; the weight lists below are aligned with this order.
VOCABULARY = ['free', 'online', '!!!', 'results', 'repository', 'paper', 'call',
              'conference', 'registration', 'submission', 'deadline']

# Unnormalised per-word weights for each class (spam sums to 31, ham to 25).
SPAM_WEIGHTS = [6.0, 5.0, 6.0, 2.0, 1.0, 2.0, 2.0, 2.0, 3.0, 1.0, 1.0]
HAM_WEIGHTS = [1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 4.0, 2.0, 2.0, 3.0]

# Fixed seed so that re-running the cell prints the same documents.
GENERATION_SEED = 0


def generate_documents(weights, n_docs=5, doc_length=6, rng=None):
    """Draw random documents from VOCABULARY.

    Args:
        weights: non-negative weight per VOCABULARY entry, in the same order;
            they are normalised to probabilities by dividing by their sum.
        n_docs: number of documents to draw.
        doc_length: number of words per document.
        rng: object providing a numpy-style ``choice(a, size, p=...)`` method,
            e.g. ``np.random.RandomState``; defaults to the global
            ``np.random`` module.

    Returns:
        A list of ``n_docs`` arrays, each holding ``doc_length`` words sampled
        with replacement.
    """
    if rng is None:
        rng = np.random
    total = float(sum(weights))
    probabilities = [weight / total for weight in weights]
    return [rng.choice(VOCABULARY, doc_length, p=probabilities)
            for _ in range(n_docs)]


# One local generator for both classes keeps the global numpy random state
# untouched while making the output reproducible.
document_rng = np.random.RandomState(GENERATION_SEED)

print("Generating 5 spam documents")
for document in generate_documents(SPAM_WEIGHTS, rng=document_rng):
    print(document)

print("Generating 5 ham documents")
for document in generate_documents(HAM_WEIGHTS, rng=document_rng):
    print(document)
In [ ]: