In [31]:
# Ref: https://web2dot5.wordpress.com/2012/03/21/text-classification-in-python/
# Ref: http://blog.yhathq.com/posts/naive-bayes-in-python.html
import re
import os
import string
def remove_punctuation(s):
    """Return s with every ASCII punctuation character removed.

    The original used string.maketrans("", "") plus the two-argument form of
    str.translate, which is Python 2-only (removed in Python 3).  A regex
    substitution over string.punctuation behaves identically on both versions.
    see http://stackoverflow.com/questions/265960/best-way-to-strip-punctuation-from-a-string-in-python
    """
    return re.sub("[%s]" % re.escape(string.punctuation), "", s)
def tokenize(text):
    """Lower-case text, strip punctuation, and split it into word tokens.

    Fixes over the original:
    - the split pattern is a raw string (``"\\W+"`` is a deprecated/invalid
      escape sequence in modern Python);
    - re.split produces empty strings when the text starts or ends with a
      non-word character; those are filtered out so they are never tallied
      as words downstream.
    """
    text = remove_punctuation(text)
    text = text.lower()
    return [w for w in re.split(r"\W+", text) if w]
def count_words(words):
    """Tally occurrences of each word.

    Returns a dict {word: count}; counts are floats so the probability
    arithmetic downstream stays in floating point.
    """
    tally = {}
    for token in words:
        if token in tally:
            tally[token] += 1.0
        else:
            tally[token] = 1.0
    return tally
In [32]:
# Train
# Build a 3-way (one-of) naive Bayes language model from data/train/*.csv:
#   vocab       -- total count of each word over all training documents
#   word_counts -- per-category {word: count}
#   priors      -- number of training documents seen per category
dataTrainDir = "data/train/"
# setup some structures to store our data
vocab = {}
word_counts = {
    "spanish": {},
    "english": {},
    "french": {}
}
priors = {
    "spanish": 0.,
    "english": 0.,
    "french": 0.
}
docs = []
for f in os.listdir("data/train"):
    f = f.strip()
    if not f.endswith(".csv"):
        # skip non .csv files
        continue
    elif "trainSet100Sp.csv" in f:
        category = "spanish"
    elif "trainSet100En.csv" in f:
        category = "english"
    elif "trainSet100Fr.csv" in f:
        category = "french"
    else:
        # BUG FIX: an unrecognized .csv used to fall through with `category`
        # still set from the previous iteration (or unset on the very first
        # file, raising NameError); skip such files instead.
        continue
    docs.append((category, f))
    priors[category] += 1
    # `with` guarantees the training file is closed (was leaked before)
    with open(dataTrainDir + f) as fh:
        text = fh.read()
    words = tokenize(text)
    counts = count_words(words)
    for word, count in counts.items():
        # if we haven't seen a word yet, add it to the dicts with a count of 0
        if word not in vocab:
            vocab[word] = 0.0
        if word not in word_counts[category]:
            word_counts[category][word] = 0.0
        vocab[word] += count
        word_counts[category][word] += count
In [33]:
# Test: score one document against the three language models trained above.
# Relies on vocab / word_counts / priors built by the training cell.
import math
testDir = "data/test/"
#testDoc = open(testDir + "testSet100Sp.csv").read()
#testDoc = open(testDir + "testSet100En.csv").read()
#testDoc = open(testDir + "testSet100Fr.csv").read()
# `with` guarantees the test file is closed (was leaked before)
with open(testDir + "testSet100De.csv") as fh:
    testDoc = fh.read()
words = tokenize(testDoc)
counts = count_words(words)
total_docs = sum(priors.values())
prior_spanish = priors["spanish"] / total_docs
prior_english = priors["english"] / total_docs
prior_french = priors["french"] / total_docs
# PERF FIX: these totals are loop-invariant but were recomputed for every
# test word, making scoring O(#test words * vocabulary size).
total_vocab = sum(vocab.values())
total_spanish = sum(word_counts["spanish"].values())
total_english = sum(word_counts["english"].values())
total_french = sum(word_counts["french"].values())
log_prob_spanish = 0.0
log_prob_english = 0.0
log_prob_french = 0.0
for w, cnt in counts.items():
    # skip words we haven't seen in training, and words of 3 letters or fewer
    if w not in vocab or len(w) <= 3:
        continue
    # probability that the word occurs at all (over all training docs)
    p_word = vocab[w] / total_vocab
    # P(word|category): probability the word appears given that the
    # document belongs to <category>
    p_w_given_spanish = word_counts["spanish"].get(w, 0.0) / total_spanish
    p_w_given_english = word_counts["english"].get(w, 0.0) / total_english
    p_w_given_french = word_counts["french"].get(w, 0.0) / total_french
    # accumulate in log space; skip zero probabilities (the word never
    # appears for that category) since log(0) is undefined
    if p_w_given_spanish > 0:
        log_prob_spanish += math.log(cnt * p_w_given_spanish / p_word)
    if p_w_given_english > 0:
        log_prob_english += math.log(cnt * p_w_given_english / p_word)
    if p_w_given_french > 0:
        log_prob_french += math.log(cnt * p_w_given_french / p_word)
# leave log space via exp; %s formatting reproduces Python 2's comma-print
# output and also runs unchanged under Python 3
print("Score(spanish): %s" % math.exp(log_prob_spanish + math.log(prior_spanish)))
print("Score(english): %s" % math.exp(log_prob_english + math.log(prior_english)))
print("Score(french): %s" % math.exp(log_prob_french + math.log(prior_french)))
#testSet100Sp.csv
#Score(spanish): 3.02240985965e+127
#Score(english): 279342099.181
#Score(french): 563.893354333
#testSet100En.csv
#Score(spanish): 5368498.18807
#Score(english): 2.01867210401e+147
#Score(french): 3025960.44281
#testSet100Fr.csv
#Score(spanish): 2737.14909534
#Score(english): 208444199.512
#Score(french): 2.62761729517e+101
#testSet100De.csv
#Score(spanish): 4.66277969722
#Score(english): 6647696.78399
#Score(french): 126402.30399
In [34]:
# Any-of (binary) classifier: Spanish vs. non-Spanish.
# English and French training files are pooled into the "nonspanish" class.
import re
import os
import string
dataTrainDir = "data/train/"
# setup some structures to store our data
vocab = {}
word_counts = {
    "spanish": {},
    "nonspanish": {}
}
priors = {
    "spanish": 0.,
    "nonspanish": 0.
}
docs = []
for f in os.listdir("data/train"):
    f = f.strip()
    if not f.endswith(".csv"):
        # skip non .csv files
        continue
    elif "trainSet100Sp.csv" in f:
        category = "spanish"
    elif "trainSet100En.csv" in f or "trainSet100Fr.csv" in f:
        category = "nonspanish"
    else:
        # BUG FIX: skip unrecognized .csv files instead of silently reusing
        # the category left over from the previous loop iteration
        continue
    docs.append((category, f))
    priors[category] += 1
    # `with` guarantees the training file is closed (was leaked before)
    with open(dataTrainDir + f) as fh:
        text = fh.read()
    words = tokenize(text)
    counts = count_words(words)
    for word, count in counts.items():
        # seed unseen words with 0 so += works uniformly
        if word not in vocab:
            vocab[word] = 0.0
        if word not in word_counts[category]:
            word_counts[category][word] = 0.0
        vocab[word] += count
        word_counts[category][word] += count
# Test
import math
testDir = "data/test/"
#testDoc = open(testDir + "testSet100Sp.csv").read()
#testDoc = open(testDir + "testSet100En.csv").read()
#testDoc = open(testDir + "testSet100Fr.csv").read()
with open(testDir + "testSet100De.csv") as fh:
    testDoc = fh.read()
words = tokenize(testDoc)
counts = count_words(words)
total_docs = sum(priors.values())
prior_spanish = priors["spanish"] / total_docs
prior_nonspanish = priors["nonspanish"] / total_docs
# PERF FIX: hoist loop-invariant totals out of the scoring loop
# (previously recomputed once per test word)
total_vocab = sum(vocab.values())
total_spanish = sum(word_counts["spanish"].values())
total_nonspanish = sum(word_counts["nonspanish"].values())
log_prob_spanish = 0.0
log_prob_nonspanish = 0.0
for w, cnt in counts.items():
    # skip unseen words and words of 3 letters or fewer
    if w not in vocab or len(w) <= 3:
        continue
    # probability that the word occurs at all
    p_word = vocab[w] / total_vocab
    # P(word|category)
    p_w_given_spanish = word_counts["spanish"].get(w, 0.0) / total_spanish
    p_w_given_nonspanish = word_counts["nonspanish"].get(w, 0.0) / total_nonspanish
    # accumulate in log space; skip zero probabilities (log(0) undefined)
    if p_w_given_spanish > 0:
        log_prob_spanish += math.log(cnt * p_w_given_spanish / p_word)
    if p_w_given_nonspanish > 0:
        log_prob_nonspanish += math.log(cnt * p_w_given_nonspanish / p_word)
# leave log space via exp; %s formatting reproduces Python 2's comma-print
# output and also runs unchanged under Python 3
print("Score(spanish): %s" % math.exp(log_prob_spanish + math.log(prior_spanish)))
print("Score(nonspanish): %s" % math.exp(log_prob_nonspanish + math.log(prior_nonspanish)))
#testSet100Sp.csv
#Score(spanish): 3.02240985965e+127 *****
#Score(nonspanish): 21346.8911115
#testSet100En.csv
#Score(spanish): 5368498.18807
#Score(nonspanish): 3.69915740887e+84 *****
#testSet100Fr.csv
#Score(spanish): 2737.14909534
#Score(nonspanish): 2.1772360453e+60 *****
#testSet100De.csv
#Score(spanish): 4.66277969722
#Score(nonspanish): 150759.068523 *****
In [38]:
# Any-of (binary) classifier: English vs. non-English.
# Spanish and French training files are pooled into the "nonenglish" class.
import re
import os
import string
dataTrainDir = "data/train/"
# setup some structures to store our data
vocab = {}
word_counts = {
    "english": {},
    "nonenglish": {}
}
priors = {
    "english": 0.,
    "nonenglish": 0.
}
docs = []
for f in os.listdir("data/train"):
    f = f.strip()
    if not f.endswith(".csv"):
        # skip non .csv files
        continue
    elif "trainSet100En.csv" in f:
        category = "english"
    elif "trainSet100Sp.csv" in f or "trainSet100Fr.csv" in f:
        category = "nonenglish"
    else:
        # BUG FIX: skip unrecognized .csv files instead of silently reusing
        # the category left over from the previous loop iteration
        continue
    docs.append((category, f))
    priors[category] += 1
    # `with` guarantees the training file is closed (was leaked before)
    with open(dataTrainDir + f) as fh:
        text = fh.read()
    words = tokenize(text)
    counts = count_words(words)
    for word, count in counts.items():
        # seed unseen words with 0 so += works uniformly
        if word not in vocab:
            vocab[word] = 0.0
        if word not in word_counts[category]:
            word_counts[category][word] = 0.0
        vocab[word] += count
        word_counts[category][word] += count
# Test
import math
testDir = "data/test/"
#testDoc = open(testDir + "testSet100Sp.csv").read()
#testDoc = open(testDir + "testSet100En.csv").read()
#testDoc = open(testDir + "testSet100Fr.csv").read()
with open(testDir + "testSet100De.csv") as fh:
    testDoc = fh.read()
words = tokenize(testDoc)
counts = count_words(words)
total_docs = sum(priors.values())
prior_english = priors["english"] / total_docs
prior_nonenglish = priors["nonenglish"] / total_docs
# PERF FIX: hoist loop-invariant totals out of the scoring loop
# (previously recomputed once per test word)
total_vocab = sum(vocab.values())
total_english = sum(word_counts["english"].values())
total_nonenglish = sum(word_counts["nonenglish"].values())
log_prob_english = 0.0
log_prob_nonenglish = 0.0
for w, cnt in counts.items():
    # skip unseen words and words of 3 letters or fewer
    if w not in vocab or len(w) <= 3:
        continue
    # probability that the word occurs at all
    p_word = vocab[w] / total_vocab
    # P(word|category)
    p_w_given_english = word_counts["english"].get(w, 0.0) / total_english
    p_w_given_nonenglish = word_counts["nonenglish"].get(w, 0.0) / total_nonenglish
    # accumulate in log space; skip zero probabilities (log(0) undefined)
    if p_w_given_english > 0:
        log_prob_english += math.log(cnt * p_w_given_english / p_word)
    if p_w_given_nonenglish > 0:
        log_prob_nonenglish += math.log(cnt * p_w_given_nonenglish / p_word)
# leave log space via exp; %s formatting reproduces Python 2's comma-print
# output and also runs unchanged under Python 3
print("Score(english): %s" % math.exp(log_prob_english + math.log(prior_english)))
print("Score(nonenglish): %s" % math.exp(log_prob_nonenglish + math.log(prior_nonenglish)))
#testSet100Sp.csv
#Score(english): 279342099.181
#Score(nonenglish): 5.85760568568e+69 *****
#testSet100En.csv
#Score(english): 2.01867210401e+147 *****
#Score(nonenglish): 971394.688083
#testSet100Fr.csv
#Score(english): 208444199.512
#Score(nonenglish): 7.03242647119e+57 *****
#testSet100De.csv
#Score(english): 6647696.78399 *****
#Score(nonenglish): 335.305742197
In [42]:
# Any-of (binary) classifier: French vs. non-French.
# Spanish and English training files are pooled into the "nonfrench" class.
import re
import os
import string
dataTrainDir = "data/train/"
# setup some structures to store our data
vocab = {}
word_counts = {
    "french": {},
    "nonfrench": {}
}
priors = {
    "french": 0.,
    "nonfrench": 0.
}
docs = []
for f in os.listdir("data/train"):
    f = f.strip()
    if not f.endswith(".csv"):
        # skip non .csv files
        continue
    elif "trainSet100Fr.csv" in f:
        category = "french"
    elif "trainSet100Sp.csv" in f or "trainSet100En.csv" in f:
        category = "nonfrench"
    else:
        # BUG FIX: skip unrecognized .csv files instead of silently reusing
        # the category left over from the previous loop iteration
        continue
    docs.append((category, f))
    priors[category] += 1
    # `with` guarantees the training file is closed (was leaked before)
    with open(dataTrainDir + f) as fh:
        text = fh.read()
    words = tokenize(text)
    counts = count_words(words)
    for word, count in counts.items():
        # seed unseen words with 0 so += works uniformly
        if word not in vocab:
            vocab[word] = 0.0
        if word not in word_counts[category]:
            word_counts[category][word] = 0.0
        vocab[word] += count
        word_counts[category][word] += count
# Test
import math
testDir = "data/test/"
#testDoc = open(testDir + "testSet100Sp.csv").read()
#testDoc = open(testDir + "testSet100En.csv").read()
#testDoc = open(testDir + "testSet100Fr.csv").read()
with open(testDir + "testSet100De.csv") as fh:
    testDoc = fh.read()
words = tokenize(testDoc)
counts = count_words(words)
total_docs = sum(priors.values())
prior_french = priors["french"] / total_docs
prior_nonfrench = priors["nonfrench"] / total_docs
# PERF FIX: hoist loop-invariant totals out of the scoring loop
# (previously recomputed once per test word)
total_vocab = sum(vocab.values())
total_french = sum(word_counts["french"].values())
total_nonfrench = sum(word_counts["nonfrench"].values())
log_prob_french = 0.0
log_prob_nonfrench = 0.0
for w, cnt in counts.items():
    # skip unseen words and words of 3 letters or fewer
    if w not in vocab or len(w) <= 3:
        continue
    # probability that the word occurs at all
    p_word = vocab[w] / total_vocab
    # P(word|category)
    p_w_given_french = word_counts["french"].get(w, 0.0) / total_french
    p_w_given_nonfrench = word_counts["nonfrench"].get(w, 0.0) / total_nonfrench
    # accumulate in log space; skip zero probabilities (log(0) undefined)
    if p_w_given_french > 0:
        log_prob_french += math.log(cnt * p_w_given_french / p_word)
    if p_w_given_nonfrench > 0:
        log_prob_nonfrench += math.log(cnt * p_w_given_nonfrench / p_word)
# leave log space via exp; %s formatting reproduces Python 2's comma-print
# output and also runs unchanged under Python 3
print("Score(french): %s" % math.exp(log_prob_french + math.log(prior_french)))
print("Score(nonfrench): %s" % math.exp(log_prob_nonfrench + math.log(prior_nonfrench)))
#testSet100Sp.csv
#Score(french): 563.893354333
#Score(nonfrench): 2.88230939706e+72 *****
#testSet100En.csv
#Score(french): 3025960.44281
#Score(nonfrench): 2.36698799195e+85 *****
#testSet100Fr.csv
#Score(french): 2.62761729517e+101 *****
#Score(nonfrench): 29939.2573137
#testSet100De.csv
#Score(french): 126402.30399 *****
#Score(nonfrench): 1358.78830053
In [ ]:
# One-of
#--------------------------------------------------------------
# testSet100Sp.csv
# Spanish classifier
#Score(spanish): 3.02240985965e+127 *****
#Score(nonspanish): 21346.8911115
# English classifier
#Score(english): 279342099.181
#Score(nonenglish): 5.85760568568e+69
# French classifier
#Score(french): 563.893354333
#Score(nonfrench): 2.88230939706e+72
#--------------------------------------------------------------
#--------------------------------------------------------------
# testSet100En.csv
# Spanish classifier
#Score(spanish): 5368498.18807
#Score(nonspanish): 3.69915740887e+84
# English classifier
#Score(english): 2.01867210401e+147 *****
#Score(nonenglish): 971394.688083
# French classifier
#Score(french): 3025960.44281
#Score(nonfrench): 2.36698799195e+85
#--------------------------------------------------------------
#--------------------------------------------------------------
#testSet100Fr.csv
# Spanish classifier
#Score(spanish): 2737.14909534
#Score(nonspanish): 2.1772360453e+60
# English classifier
#Score(english): 208444199.512
#Score(nonenglish): 7.03242647119e+57
# French classifier
#Score(french): 2.62761729517e+101 *****
#Score(nonfrench): 29939.2573137
#--------------------------------------------------------------
#--------------------------------------------------------------
#testSet100De.csv
# Spanish classifier
#Score(spanish): 4.66277969722
#Score(nonspanish): 150759.068523
# English classifier
#Score(english): 6647696.78399 *****
#Score(nonenglish): 335.305742197
# French classifier
#Score(french): 126402.30399
#Score(nonfrench): 1358.78830053