In [1]:
import sys
from sklearn.svm import SVC
from time import time
sys.path.append("../tools/")
from email_preprocess import preprocess
In [2]:
# Load the preprocessed email data: TF-IDF feature matrices and author labels,
# already split into train/test sets by the project's email_preprocess helper.
# (Labels appear to be 0/1 per author — presumably 1 = Chris, given the
# counting cells below; confirm against email_preprocess.)
features_train, features_test, labels_train, labels_test = preprocess()
In [3]:
# Train a linear-kernel SVM on the full training set.
clf = SVC(kernel="linear")
clf.fit(features_train,labels_train)
Out[3]:
In [4]:
# Accuracy of the linear SVM on the held-out test set.
clf.score(features_test,labels_test)
Out[4]:
In [5]:
# Time a refit of the linear SVM on the full training set — fitting is the
# expensive step here.
%time clf.fit(features_train,labels_train)
Out[5]:
And, not surprisingly, this is much, much slower to train than something like Naive Bayes.
In [6]:
# Shrink the training set to 1% of its size to trade accuracy for speed.
# Use floor division (//) so the slice bound is an int: under true division
# (Python 3, or `from __future__ import division`) `len(x)/100` is a float
# and slicing with it raises TypeError. `//` behaves identically in Python 2.
# NOTE: this overwrites features_train/labels_train in place; the full data
# is restored by the preprocess() call in a later cell.
features_train = features_train[:len(features_train)//100]
labels_train = labels_train[:len(labels_train)//100]
clf.fit(features_train,labels_train)
clf.score(features_test,labels_test)
Out[6]:
In [7]:
# Fit an RBF-kernel SVM on the (now reduced, 1%) training set for comparison
# with the linear kernel above.
clf_rbf = SVC(kernel="rbf")
clf_rbf.fit(features_train,labels_train)
Out[7]:
In [8]:
# Test-set accuracy of the RBF kernel with default C.
clf_rbf.score(features_test,labels_test)
Out[8]:
I ran into some trouble with scikit-learn's grid-search utilities (GridSearchCV), which would otherwise be a better way to do this parameter sweep than fitting each C by hand.
In [9]:
def _fit_rbf_svm(c_value):
    """Fit an RBF-kernel SVC with the given C on the current training set."""
    model = SVC(C=c_value, kernel="rbf")
    model.fit(features_train, labels_train)
    return model

# Sweep the regularization parameter C over four orders of magnitude,
# keeping one fitted classifier per value for the comparison cell below.
clf10 = _fit_rbf_svm(10.0)
clf100 = _fit_rbf_svm(100.0)
clf1000 = _fit_rbf_svm(1000.0)
clf10000 = _fit_rbf_svm(10000.0)
Out[9]:
In [10]:
print "C = 10: ", clf10.score(features_test,labels_test)
print "C = 100: ", clf100.score(features_test,labels_test)
print "C = 1000: ", clf1000.score(features_test,labels_test)
print "C = 10,000: ", clf10000.score(features_test,labels_test)
In [11]:
# Reload the FULL training set (undoing the 1% slice above), then fit the
# final model using the best C found in the sweep.
features_train, features_test, labels_train, labels_test = preprocess()
clf = SVC(C=10000,kernel="rbf")
clf.fit(features_train,labels_train)
Out[11]:
In [12]:
# Test-set accuracy of the final model (C=10000, RBF, full training data).
clf.score(features_test,labels_test)
Out[12]:
In [13]:
# Predicted author label for every test email (presumably 0/1 — see the
# counting cells below, which treat 1 as "Chris"; confirm in email_preprocess).
pred = clf.predict(features_test)
In [14]:
for i in [10,26,50]:
print 'training point',i,'--predicted:',pred[i],'real value:',labels_test[i]
In [15]:
# Raw count:
# With 0/1 labels, summing the prediction vector counts the class-1 emails
# (presumably those attributed to Chris — TODO confirm the label encoding).
chrisCount = sum(pred)
chrisCount
Out[15]:
In [17]:
# Proportion:
# Fraction of the test emails predicted as class 1; casting the numerator
# to float forces real (non-integer) division, same value as the original
# float-denominator form.
float(chrisCount) / len(pred)
Out[17]: