In [1]:
#NAIVE BAYES

import sys
from time import time
sys.path.append("tools/")
from email_preprocess import preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()




#########################################################
### your code goes here ###

from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score

clf = GaussianNB()
t0 = time()
clf.fit(features_train, labels_train)
print "training time: ", round(time()-t0, 3), "s"

t1 = time()
pred = clf.predict(features_test)
print "predicting time: ", round(time()-t1, 3), "s"

accuracy = accuracy_score(labels_test, pred)

print accuracy


/home/eduardo/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)
no. of Chris training emails: 7936
no. of Sara training emails: 7884
training time:  0.697 s
predicting time:  0.095 s
0.973265073948

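For context, the `preprocess()` helper splits the Chris/Sara emails into training and test sets, turns each email into a tf-idf vector, and keeps only the most informative words. The sketch below is a rough stand-in for it; the file names and the TfidfVectorizer/SelectPercentile settings are assumptions about the course's tools/email_preprocess.py, not copied from it.

# Sketch of a preprocess()-style helper (assumed settings, not the actual tools module)
import pickle
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectPercentile, f_classif

def preprocess_sketch(words_file="word_data.pkl", authors_file="email_authors.pkl"):
    word_data = pickle.load(open(words_file, "rb"))
    authors = pickle.load(open(authors_file, "rb"))

    # hold out 10% of the emails for testing
    features_train, features_test, labels_train, labels_test = train_test_split(
        word_data, authors, test_size=0.1, random_state=42)

    # turn each email into a tf-idf vector, ignoring very common words
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words="english")
    features_train = vectorizer.fit_transform(features_train)
    features_test = vectorizer.transform(features_test)

    # keep only the top 10% most informative features
    selector = SelectPercentile(f_classif, percentile=10)
    features_train = selector.fit_transform(features_train.toarray(), labels_train)
    features_test = selector.transform(features_test.toarray())

    return features_train, features_test, labels_train, labels_test
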
In [2]:
#SVM

import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()




#########################################################
### your code goes here ###
from sklearn import svm
from sklearn.metrics import accuracy_score

rbf_kernel_svm = svm.SVC(kernel='rbf', C=10000.)

### train on only 1% of the data: much faster to fit, at some cost in accuracy
features_train = features_train[:len(features_train)/100]
labels_train = labels_train[:len(labels_train)/100]

t0 = time()
rbf_kernel_svm.fit(features_train, labels_train)
print "training time with SVM's RBF kernel", time() - t0

t1 = time()
pred = rbf_kernel_svm.predict(features_test)
print "prediction time with SVM's RBF kernel", time() - t1

acc = accuracy_score(labels_test, pred)
print acc

#########################################################

import numpy as np

def time_with_power(power, people, times):
    """Draw `people` samples from a power distribution with exponent `power`,
    then add `times` further rounds of draws of the same shape."""
    results = np.random.power(power, people)
    for i in range(times):
        results += np.random.power(power, people)
    return results


no. of Chris training emails: 7936
no. of Sara training emails: 7884
training time with SVM's RBF kernel 0.13868188858
prediction time with SVM's RBF kernel 0.8358938694
0.892491467577

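The accuracy above comes from training the RBF kernel on only the 1% subset. A natural follow-up is to see how the penalty parameter C affects that result; the loop below is a sketch of such a sweep (the C values are illustrative choices, and no timings or accuracies are reported here).

# Sketch: sweep C for the RBF kernel on the 1% training subset defined above
from time import time
from sklearn import svm
from sklearn.metrics import accuracy_score

for C in (10., 100., 1000., 10000.):
    clf = svm.SVC(kernel='rbf', C=C)
    t0 = time()
    clf.fit(features_train, labels_train)      # features_train is already the 1% slice
    pred = clf.predict(features_test)
    print "C = %g, accuracy = %.4f, fit+predict = %.2f s" % (
        C, accuracy_score(labels_test, pred), time() - t0)
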
In [ ]:
#DECISION TREE

import sys
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()

print "Size of features matrix: ", features_train.shape


#########################################################
### your code goes here ###

from sklearn import tree
from sklearn.metrics import accuracy_score

clf = tree.DecisionTreeClassifier(min_samples_split=40)

clf.fit(features_train, labels_train)
pred = clf.predict(features_test)

acc = accuracy_score(labels_test, pred)
print "Accuracy: ", acc
#########################################################


no. of Chris training emails: 7936
no. of Sara training emails: 7884
Size of features matrix:  (15820, 3785)
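To close the loop, the three classifiers can be run back to back on the same preprocessed data for a side-by-side accuracy and timing comparison. This is a sketch only; no numbers are reproduced here.

# Sketch: compare the three classifiers from this notebook on the full training set
from time import time
from sklearn.naive_bayes import GaussianNB
from sklearn import svm, tree
from sklearn.metrics import accuracy_score

classifiers = [
    ("Naive Bayes", GaussianNB()),
    ("SVM (rbf, C=10000)", svm.SVC(kernel='rbf', C=10000.)),
    ("Decision tree (min_samples_split=40)", tree.DecisionTreeClassifier(min_samples_split=40)),
]

for name, clf in classifiers:
    t0 = time()
    clf.fit(features_train, labels_train)
    pred = clf.predict(features_test)
    print "%s: accuracy = %.4f, fit+predict = %.1f s" % (
        name, accuracy_score(labels_test, pred), time() - t0)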