In [31]:
# Ref: https://web2dot5.wordpress.com/2012/03/21/text-classification-in-python/
# Ref: http://blog.yhathq.com/posts/naive-bayes-in-python.html
import re
import os
import string
def remove_punctuation(s):
    """Return s with every ASCII punctuation character removed.

    The original used string.maketrans("", "") plus the two-argument form of
    str.translate, which is Python 2-only (removed in Python 3).  A regex
    substitution over string.punctuation behaves identically on both versions.
    see http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
    """
    return re.sub("[%s]" % re.escape(string.punctuation), "", s)
def tokenize(text):
    """Lower-case text, strip punctuation, and split it into word tokens.

    Fixes over the original:
    - the split pattern is a raw string (``"\\W+"`` is a deprecated/invalid
      escape sequence in modern Python);
    - re.split produces empty strings when the text starts or ends with a
      non-word character; those are filtered out so they are never tallied
      as words downstream.
    """
    text = remove_punctuation(text)
    text = text.lower()
    return [w for w in re.split(r"\W+", text) if w]
def count_words(words):
    """Tally occurrences of each word.

    Returns a dict {word: count}; counts are floats so the probability
    arithmetic downstream stays in floating point.
    """
    tally = {}
    for token in words:
        if token in tally:
            tally[token] += 1.0
        else:
            tally[token] = 1.0
    return tally
In [32]:
# Train
# Build a 3-way (one-of) naive Bayes language model from data/train/*.csv:
#   vocab       -- total count of each word over all training documents
#   word_counts -- per-category {word: count}
#   priors      -- number of training documents seen per category
dataTrainDir = "data/train/"
# setup some structures to store our data
vocab = {}
word_counts = {
    "spanish": {},
    "english": {},
    "french": {}
}
priors = {
    "spanish": 0.,
    "english": 0.,
    "french": 0.
}
docs = []
for f in os.listdir("data/train"):
    f = f.strip()
    if not f.endswith(".csv"):
        # skip non .csv files
        continue
    elif "trainSet100Sp.csv" in f:
        category = "spanish"
    elif "trainSet100En.csv" in f:
        category = "english"
    elif "trainSet100Fr.csv" in f:
        category = "french"
    else:
        # BUG FIX: an unrecognized .csv used to fall through with `category`
        # still set from the previous iteration (or unset on the very first
        # file, raising NameError); skip such files instead.
        continue
    docs.append((category, f))
    priors[category] += 1
    # `with` guarantees the training file is closed (was leaked before)
    with open(dataTrainDir + f) as fh:
        text = fh.read()
    words = tokenize(text)
    counts = count_words(words)
    for word, count in counts.items():
        # if we haven't seen a word yet, add it to the dicts with a count of 0
        if word not in vocab:
            vocab[word] = 0.0
        if word not in word_counts[category]:
            word_counts[category][word] = 0.0
        vocab[word] += count
        word_counts[category][word] += count
In [33]:
# Test: score one document against the three language models trained above.
# Relies on vocab / word_counts / priors built by the training cell.
import math
testDir = "data/test/"
#testDoc = open(testDir + "testSet100Sp.csv").read()
#testDoc = open(testDir + "testSet100En.csv").read()
#testDoc = open(testDir + "testSet100Fr.csv").read()
# `with` guarantees the test file is closed (was leaked before)
with open(testDir + "testSet100De.csv") as fh:
    testDoc = fh.read()
words = tokenize(testDoc)
counts = count_words(words)
total_docs = sum(priors.values())
prior_spanish = priors["spanish"] / total_docs
prior_english = priors["english"] / total_docs
prior_french = priors["french"] / total_docs
# PERF FIX: these totals are loop-invariant but were recomputed for every
# test word, making scoring O(#test words * vocabulary size).
total_vocab = sum(vocab.values())
total_spanish = sum(word_counts["spanish"].values())
total_english = sum(word_counts["english"].values())
total_french = sum(word_counts["french"].values())
log_prob_spanish = 0.0
log_prob_english = 0.0
log_prob_french = 0.0
for w, cnt in counts.items():
    # skip words we haven't seen in training, and words of 3 letters or fewer
    if w not in vocab or len(w) <= 3:
        continue
    # probability that the word occurs at all (over all training docs)
    p_word = vocab[w] / total_vocab
    # P(word|category): probability the word appears given that the
    # document belongs to <category>
    p_w_given_spanish = word_counts["spanish"].get(w, 0.0) / total_spanish
    p_w_given_english = word_counts["english"].get(w, 0.0) / total_english
    p_w_given_french = word_counts["french"].get(w, 0.0) / total_french
    # accumulate in log space; skip zero probabilities (the word never
    # appears for that category) since log(0) is undefined
    if p_w_given_spanish > 0:
        log_prob_spanish += math.log(cnt * p_w_given_spanish / p_word)
    if p_w_given_english > 0:
        log_prob_english += math.log(cnt * p_w_given_english / p_word)
    if p_w_given_french > 0:
        log_prob_french += math.log(cnt * p_w_given_french / p_word)
# leave log space via exp; %s formatting reproduces Python 2's comma-print
# output and also runs unchanged under Python 3
print("Score(spanish): %s" % math.exp(log_prob_spanish + math.log(prior_spanish)))
print("Score(english): %s" % math.exp(log_prob_english + math.log(prior_english)))
print("Score(french): %s" % math.exp(log_prob_french + math.log(prior_french)))
#testSet100Sp.csv
#Score(spanish): 3.02240985965e+127
#Score(english): 279342099.181
#Score(french): 563.893354333
#testSet100En.csv
#Score(spanish): 5368498.18807
#Score(english): 2.01867210401e+147
#Score(french): 3025960.44281
#testSet100Fr.csv
#Score(spanish): 2737.14909534
#Score(english): 208444199.512
#Score(french): 2.62761729517e+101
#testSet100De.csv
#Score(spanish): 4.66277969722
#Score(english): 6647696.78399
#Score(french): 126402.30399
In [34]:
# Any-of (binary) classifier: Spanish vs. non-Spanish.
# English and French training files are pooled into the "nonspanish" class.
import re
import os
import string
dataTrainDir = "data/train/"
# setup some structures to store our data
vocab = {}
word_counts = {
    "spanish": {},
    "nonspanish": {}
}
priors = {
    "spanish": 0.,
    "nonspanish": 0.
}
docs = []
for f in os.listdir("data/train"):
    f = f.strip()
    if not f.endswith(".csv"):
        # skip non .csv files
        continue
    elif "trainSet100Sp.csv" in f:
        category = "spanish"
    elif "trainSet100En.csv" in f or "trainSet100Fr.csv" in f:
        category = "nonspanish"
    else:
        # BUG FIX: skip unrecognized .csv files instead of silently reusing
        # the category left over from the previous loop iteration
        continue
    docs.append((category, f))
    priors[category] += 1
    # `with` guarantees the training file is closed (was leaked before)
    with open(dataTrainDir + f) as fh:
        text = fh.read()
    words = tokenize(text)
    counts = count_words(words)
    for word, count in counts.items():
        # seed unseen words with 0 so += works uniformly
        if word not in vocab:
            vocab[word] = 0.0
        if word not in word_counts[category]:
            word_counts[category][word] = 0.0
        vocab[word] += count
        word_counts[category][word] += count
# Test
import math
testDir = "data/test/"
#testDoc = open(testDir + "testSet100Sp.csv").read()
#testDoc = open(testDir + "testSet100En.csv").read()
#testDoc = open(testDir + "testSet100Fr.csv").read()
with open(testDir + "testSet100De.csv") as fh:
    testDoc = fh.read()
words = tokenize(testDoc)
counts = count_words(words)
total_docs = sum(priors.values())
prior_spanish = priors["spanish"] / total_docs
prior_nonspanish = priors["nonspanish"] / total_docs
# PERF FIX: hoist loop-invariant totals out of the scoring loop
# (previously recomputed once per test word)
total_vocab = sum(vocab.values())
total_spanish = sum(word_counts["spanish"].values())
total_nonspanish = sum(word_counts["nonspanish"].values())
log_prob_spanish = 0.0
log_prob_nonspanish = 0.0
for w, cnt in counts.items():
    # skip unseen words and words of 3 letters or fewer
    if w not in vocab or len(w) <= 3:
        continue
    # probability that the word occurs at all
    p_word = vocab[w] / total_vocab
    # P(word|category)
    p_w_given_spanish = word_counts["spanish"].get(w, 0.0) / total_spanish
    p_w_given_nonspanish = word_counts["nonspanish"].get(w, 0.0) / total_nonspanish
    # accumulate in log space; skip zero probabilities (log(0) undefined)
    if p_w_given_spanish > 0:
        log_prob_spanish += math.log(cnt * p_w_given_spanish / p_word)
    if p_w_given_nonspanish > 0:
        log_prob_nonspanish += math.log(cnt * p_w_given_nonspanish / p_word)
# leave log space via exp; %s formatting reproduces Python 2's comma-print
# output and also runs unchanged under Python 3
print("Score(spanish): %s" % math.exp(log_prob_spanish + math.log(prior_spanish)))
print("Score(nonspanish): %s" % math.exp(log_prob_nonspanish + math.log(prior_nonspanish)))
#testSet100Sp.csv
#Score(spanish): 3.02240985965e+127 *****
#Score(nonspanish): 21346.8911115
#testSet100En.csv
#Score(spanish): 5368498.18807
#Score(nonspanish): 3.69915740887e+84 *****
#testSet100Fr.csv
#Score(spanish): 2737.14909534
#Score(nonspanish): 2.1772360453e+60 *****
#testSet100De.csv
#Score(spanish): 4.66277969722
#Score(nonspanish): 150759.068523 *****
In [38]:
# Any-of (binary) classifier: English vs. non-English.
# Spanish and French training files are pooled into the "nonenglish" class.
import re
import os
import string
dataTrainDir = "data/train/"
# setup some structures to store our data
vocab = {}
word_counts = {
    "english": {},
    "nonenglish": {}
}
priors = {
    "english": 0.,
    "nonenglish": 0.
}
docs = []
for f in os.listdir("data/train"):
    f = f.strip()
    if not f.endswith(".csv"):
        # skip non .csv files
        continue
    elif "trainSet100En.csv" in f:
        category = "english"
    elif "trainSet100Sp.csv" in f or "trainSet100Fr.csv" in f:
        category = "nonenglish"
    else:
        # BUG FIX: skip unrecognized .csv files instead of silently reusing
        # the category left over from the previous loop iteration
        continue
    docs.append((category, f))
    priors[category] += 1
    # `with` guarantees the training file is closed (was leaked before)
    with open(dataTrainDir + f) as fh:
        text = fh.read()
    words = tokenize(text)
    counts = count_words(words)
    for word, count in counts.items():
        # seed unseen words with 0 so += works uniformly
        if word not in vocab:
            vocab[word] = 0.0
        if word not in word_counts[category]:
            word_counts[category][word] = 0.0
        vocab[word] += count
        word_counts[category][word] += count
# Test
import math
testDir = "data/test/"
#testDoc = open(testDir + "testSet100Sp.csv").read()
#testDoc = open(testDir + "testSet100En.csv").read()
#testDoc = open(testDir + "testSet100Fr.csv").read()
with open(testDir + "testSet100De.csv") as fh:
    testDoc = fh.read()
words = tokenize(testDoc)
counts = count_words(words)
total_docs = sum(priors.values())
prior_english = priors["english"] / total_docs
prior_nonenglish = priors["nonenglish"] / total_docs
# PERF FIX: hoist loop-invariant totals out of the scoring loop
# (previously recomputed once per test word)
total_vocab = sum(vocab.values())
total_english = sum(word_counts["english"].values())
total_nonenglish = sum(word_counts["nonenglish"].values())
log_prob_english = 0.0
log_prob_nonenglish = 0.0
for w, cnt in counts.items():
    # skip unseen words and words of 3 letters or fewer
    if w not in vocab or len(w) <= 3:
        continue
    # probability that the word occurs at all
    p_word = vocab[w] / total_vocab
    # P(word|category)
    p_w_given_english = word_counts["english"].get(w, 0.0) / total_english
    p_w_given_nonenglish = word_counts["nonenglish"].get(w, 0.0) / total_nonenglish
    # accumulate in log space; skip zero probabilities (log(0) undefined)
    if p_w_given_english > 0:
        log_prob_english += math.log(cnt * p_w_given_english / p_word)
    if p_w_given_nonenglish > 0:
        log_prob_nonenglish += math.log(cnt * p_w_given_nonenglish / p_word)
# leave log space via exp; %s formatting reproduces Python 2's comma-print
# output and also runs unchanged under Python 3
print("Score(english): %s" % math.exp(log_prob_english + math.log(prior_english)))
print("Score(nonenglish): %s" % math.exp(log_prob_nonenglish + math.log(prior_nonenglish)))
#testSet100Sp.csv
#Score(english): 279342099.181
#Score(nonenglish): 5.85760568568e+69 *****
#testSet100En.csv
#Score(english): 2.01867210401e+147 *****
#Score(nonenglish): 971394.688083
#testSet100Fr.csv
#Score(english): 208444199.512
#Score(nonenglish): 7.03242647119e+57 *****
#testSet100De.csv
#Score(english): 6647696.78399 *****
#Score(nonenglish): 335.305742197
In [42]:
# Any-of (binary) classifier: French vs. non-French.
# Spanish and English training files are pooled into the "nonfrench" class.
import re
import os
import string
dataTrainDir = "data/train/"
# setup some structures to store our data
vocab = {}
word_counts = {
    "french": {},
    "nonfrench": {}
}
priors = {
    "french": 0.,
    "nonfrench": 0.
}
docs = []
for f in os.listdir("data/train"):
    f = f.strip()
    if not f.endswith(".csv"):
        # skip non .csv files
        continue
    elif "trainSet100Fr.csv" in f:
        category = "french"
    elif "trainSet100Sp.csv" in f or "trainSet100En.csv" in f:
        category = "nonfrench"
    else:
        # BUG FIX: skip unrecognized .csv files instead of silently reusing
        # the category left over from the previous loop iteration
        continue
    docs.append((category, f))
    priors[category] += 1
    # `with` guarantees the training file is closed (was leaked before)
    with open(dataTrainDir + f) as fh:
        text = fh.read()
    words = tokenize(text)
    counts = count_words(words)
    for word, count in counts.items():
        # seed unseen words with 0 so += works uniformly
        if word not in vocab:
            vocab[word] = 0.0
        if word not in word_counts[category]:
            word_counts[category][word] = 0.0
        vocab[word] += count
        word_counts[category][word] += count
# Test
import math
testDir = "data/test/"
#testDoc = open(testDir + "testSet100Sp.csv").read()
#testDoc = open(testDir + "testSet100En.csv").read()
#testDoc = open(testDir + "testSet100Fr.csv").read()
with open(testDir + "testSet100De.csv") as fh:
    testDoc = fh.read()
words = tokenize(testDoc)
counts = count_words(words)
total_docs = sum(priors.values())
prior_french = priors["french"] / total_docs
prior_nonfrench = priors["nonfrench"] / total_docs
# PERF FIX: hoist loop-invariant totals out of the scoring loop
# (previously recomputed once per test word)
total_vocab = sum(vocab.values())
total_french = sum(word_counts["french"].values())
total_nonfrench = sum(word_counts["nonfrench"].values())
log_prob_french = 0.0
log_prob_nonfrench = 0.0
for w, cnt in counts.items():
    # skip unseen words and words of 3 letters or fewer
    if w not in vocab or len(w) <= 3:
        continue
    # probability that the word occurs at all
    p_word = vocab[w] / total_vocab
    # P(word|category)
    p_w_given_french = word_counts["french"].get(w, 0.0) / total_french
    p_w_given_nonfrench = word_counts["nonfrench"].get(w, 0.0) / total_nonfrench
    # accumulate in log space; skip zero probabilities (log(0) undefined)
    if p_w_given_french > 0:
        log_prob_french += math.log(cnt * p_w_given_french / p_word)
    if p_w_given_nonfrench > 0:
        log_prob_nonfrench += math.log(cnt * p_w_given_nonfrench / p_word)
# leave log space via exp; %s formatting reproduces Python 2's comma-print
# output and also runs unchanged under Python 3
print("Score(french): %s" % math.exp(log_prob_french + math.log(prior_french)))
print("Score(nonfrench): %s" % math.exp(log_prob_nonfrench + math.log(prior_nonfrench)))
#testSet100Sp.csv
#Score(french): 563.893354333
#Score(nonfrench): 2.88230939706e+72 *****
#testSet100En.csv
#Score(french): 3025960.44281
#Score(nonfrench): 2.36698799195e+85 *****
#testSet100Fr.csv
#Score(french): 2.62761729517e+101 *****
#Score(nonfrench): 29939.2573137
#testSet100De.csv
#Score(french): 126402.30399 *****
#Score(nonfrench): 1358.78830053
In [ ]:
# One-of
#--------------------------------------------------------------
# testSet100Sp.csv
# Spanish classifier
#Score(spanish): 3.02240985965e+127 *****
#Score(nonspanish): 21346.8911115
# English classifier
#Score(english): 279342099.181
#Score(nonenglish): 5.85760568568e+69
# French classifier
#Score(french): 563.893354333
#Score(nonfrench): 2.88230939706e+72
#--------------------------------------------------------------
#--------------------------------------------------------------
# testSet100En.csv
# Spanish classifier
#Score(spanish): 5368498.18807
#Score(nonspanish): 3.69915740887e+84
# English classifier
#Score(english): 2.01867210401e+147 *****
#Score(nonenglish): 971394.688083
# French classifier
#Score(french): 3025960.44281
#Score(nonfrench): 2.36698799195e+85
#--------------------------------------------------------------
#--------------------------------------------------------------
#testSet100Fr.csv
# Spanish classifier
#Score(spanish): 2737.14909534
#Score(nonspanish): 2.1772360453e+60
# English classifier
#Score(english): 208444199.512
#Score(nonenglish): 7.03242647119e+57
# French classifier
#Score(french): 2.62761729517e+101 *****
#Score(nonfrench): 29939.2573137
#--------------------------------------------------------------
#--------------------------------------------------------------
#testSet100De.csv
# Spanish classifier
#Score(spanish): 4.66277969722
#Score(nonspanish): 150759.068523
# English classifier
#Score(english): 6647696.78399 *****
#Score(nonenglish): 335.305742197
# French classifier
#Score(french): 126402.30399
#Score(nonfrench): 1358.78830053