In [4]:
from sklearn import svm
%matplotlib inline
In [2]:
# Minimal two-sample training set: one 2-D point per class.
X, y = [[0, 0], [1, 2]], [0, 1]

# fit() hands back the estimator itself, so building and training are chained.
clf = svm.SVC().fit(X, y)

# Last expression of the cell: predicted class for a single unseen point.
clf.predict([[2., 2.]])
Out[2]:
In [3]:
%load scripts/class_vis.py
In [5]:
#!/usr/bin/python
#from udacityplots import *
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import pylab as pl
import numpy as np
#import numpy as np
#import matplotlib.pyplot as plt
#plt.ioff()
def prettyPicture(clf, X_test, y_test):
    """Draw a classifier's decision regions with the labelled test points on top.

    The unit square [0, 1] x [0, 1] is sampled on a 0.01-step mesh, every mesh
    point is classified with ``clf.predict`` and the result is rendered as a
    colour mesh. The test points are then scattered over it: points whose
    label equals 0 in blue ("fast"), points whose label equals 1 in red
    ("slow").

    Parameters
    ----------
    clf : object
        Fitted classifier; its ``predict`` is called with a two-column array
        of mesh coordinates.
    X_test : sequence of two-element sequences
        Test points. Element 0 is plotted on the x axis, element 1 on the y
        axis.
    y_test : sequence
        Labels aligned with ``X_test``; only the values 0 and 1 are drawn.

    Returns
    -------
    None
        The figure is shown with ``plt.show()``.
    """
    x_min, x_max = 0.0, 1.0
    y_min, y_max = 0.0, 1.0

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    h = .01  # step size in the mesh
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())
    plt.pcolormesh(xx, yy, Z, cmap=pl.cm.seismic)

    # Plot also the test points, split by label.
    grade_fast = [point[0] for point, label in zip(X_test, y_test) if label == 0]
    bumpy_fast = [point[1] for point, label in zip(X_test, y_test) if label == 0]
    grade_slow = [point[0] for point, label in zip(X_test, y_test) if label == 1]
    bumpy_slow = [point[1] for point, label in zip(X_test, y_test) if label == 1]

    plt.scatter(grade_fast, bumpy_fast, color="b", label="fast")
    plt.scatter(grade_slow, bumpy_slow, color="r", label="slow")
    plt.legend()

    # Column 0 (called "grade" everywhere in this file) is what goes on the
    # x axis and column 1 ("bumpy") on the y axis, so label them accordingly.
    plt.xlabel("grade")
    plt.ylabel("bumpiness")

    plt.show()
#plt.savefig("test.png")
import base64
import json
import subprocess
def output_image(name, format, bytes):
    """Print an image as a JSON payload wrapped in begin/end marker strings.

    The payload is a JSON object with the keys ``name``, ``format`` and
    ``bytes``; ``bytes`` holds the base64 text of the raw image data. The
    whole thing is written to stdout on a single line.

    Parameters
    ----------
    name : str
        Stored under the ``"name"`` key.
    format : str
        Stored under the ``"format"`` key.
    bytes : bytes
        Raw data to base64-encode.

    Notes
    -----
    ``format`` and ``bytes`` shadow builtins; the names are kept so existing
    keyword callers keep working.
    """
    image_start = "BEGIN_IMAGE_f9825uweof8jw9fj4r8"
    image_end = "END_IMAGE_0238jfw08fjsiufhw8frs"

    # base64.encodestring was removed in Python 3.9; encodebytes is its
    # replacement but does not exist on Python 2, so use whichever is there.
    encode = getattr(base64, "encodebytes", None) or base64.encodestring
    encoded = encode(bytes)
    if not isinstance(encoded, str):
        # Python 3 yields a bytes object, which json.dumps cannot serialise.
        encoded = encoded.decode("ascii")

    data = {}
    data['name'] = name
    data['format'] = format
    data['bytes'] = encoded

    # Single-argument print(...) behaves the same on Python 2 and Python 3.
    print(image_start + json.dumps(data) + image_end)
In [6]:
%load scripts/prep_terrain_data.py
In [7]:
#!/usr/bin/python
import random
def makeTerrainData(n_points=1000):
    """Generate the seeded toy terrain dataset and split it into train/test.

    Three streams of ``n_points`` uniform random numbers (grade, bumpiness and
    a noise term) are drawn after seeding the global ``random`` generator with
    42, so repeated calls return the same data. A point is labelled 1.0 when
    its grade or bumpiness exceeds 0.8; otherwise the label is
    ``round(grade * bumpy + 0.3 + 0.1 * error)``.

    Parameters
    ----------
    n_points : int, optional
        Total number of points to generate (default 1000).

    Returns
    -------
    X_train, y_train, X_test, y_test : list
        The first ``int(0.75 * n_points)`` points form the training set and
        the remainder the test set. Each feature row is ``[grade, bumpy]``;
        every label is a float.

    Notes
    -----
    Calling this function re-seeds the module-level ``random`` generator.
    """
    ### make the toy dataset
    random.seed(42)
    grade = [random.random() for _ in range(n_points)]
    bumpy = [random.random() for _ in range(n_points)]
    error = [random.random() for _ in range(n_points)]

    y = []
    for g, b, e in zip(grade, bumpy, error):
        if g > 0.8 or b > 0.8:
            y.append(1.0)
        else:
            # round() returns a float on Python 2 but an int on Python 3;
            # float() keeps every label the same type on both.
            y.append(float(round(g * b + 0.3 + 0.1 * e)))

    ### split into train/test sets
    X = [[g, b] for g, b in zip(grade, bumpy)]
    split = int(0.75 * n_points)
    X_train, X_test = X[:split], X[split:]
    y_train, y_test = y[:split], y[split:]

    return X_train, y_train, X_test, y_test
SVM
In [8]:
import sys
#from class_vis import prettyPicture
#from prep_terrain_data import makeTerrainData
import matplotlib.pyplot as plt
import copy
import numpy as np
import pylab as pl
# Build the seeded toy terrain data; makeTerrainData is defined in an earlier cell.
features_train, labels_train, features_test, labels_test = makeTerrainData()

########################## SVM #################################
### we handle the import statement and SVC creation for you here
from sklearn.svm import SVC
clf = SVC(kernel="linear")

#### now your job is to fit the classifier
#### using the training features/labels, and to
#### make a set of predictions on the test data
clf.fit(features_train, labels_train)

#### store your predictions in a list named pred
pred = clf.predict(features_test)

from sklearn.metrics import accuracy_score
# accuracy_score is declared as (y_true, y_pred). The score is the same
# either way round, but the documented order keeps this cell consistent
# with other sklearn metrics where the order does matter.
acc = accuracy_score(labels_test, pred)


def submitAccuracy():
    """Return the test-set accuracy computed in this cell."""
    return acc


# Single-argument print(...) produces the same text on Python 2 and 3.
print("Accuracy:  {0}".format(submitAccuracy()))
In [9]:
# Show the decision regions of `clf` (assigned in the cell above as a
# linear-kernel SVC) together with the held-out terrain points.
prettyPicture(clf, features_test, labels_test)
In [10]:
# Second classifier, trained on the same terrain training split.
# NOTE(review): scikit-learn documents gamma as the kernel coefficient for
# the 'rbf', 'poly' and 'sigmoid' kernels, so it presumably has no effect
# with kernel="linear" — confirm whether an RBF kernel was intended here.
clf2 = SVC(kernel="linear", gamma=1.0)
clf2.fit(features_train, labels_train)
Out[10]:
In [11]:
# Show the decision regions of `clf2` together with the held-out terrain points.
prettyPicture(clf2, features_test, labels_test)
In [12]:
# RBF-kernel SVC with a very large C, trained on the terrain training split.
# fit() returns the estimator, so construction and training are chained.
clf2 = SVC(kernel="rbf", C=100000000).fit(features_train, labels_train)

# Plot the resulting decision regions over the held-out points.
prettyPicture(clf2, features_test, labels_test)
In [16]:
%load ../ud120-projects/svm/svm_author_id.py
In [ ]:
# %%writefile ../ud120-projects/svm/svm_author_id.py
#!/usr/bin/python
"""
this is the code to accompany the Lesson 2 (SVM) mini-project
use an SVM to identify emails from the Enron corpus by their authors
Sara has label 0
Chris has label 1
"""
import sys
from time import time
sys.path.append("../ud120-projects/tools/")
from email_preprocess import preprocess
### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
### NOTE(review): this call sits outside the __main__ guard below, so the
### preprocessing runs every time this cell/script is executed.
features_train, features_test, labels_train, labels_test = preprocess()
#########################################################
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
if __name__ == '__main__':
    # Linear-kernel SVM trained on the full training set.
    clt = SVC(kernel="linear")

    # The start time has to be stored in t0: that is the name the elapsed-time
    # expression below reads. Storing it under any other name raises a
    # NameError on a fresh kernel.
    t0 = time()
    clt.fit(features_train, labels_train)
    print("Training time:  {0} seconds.".format(round(time() - t0, 3)))

    t0 = time()
    pred = clt.predict(features_test)
    print("Prediction Time:  {0} seconds.".format(round(time() - t0, 3)))

    # accuracy_score is declared as (y_true, y_pred).
    acc = accuracy_score(labels_test, pred)
    print("Accuracy:  {0}".format(acc))
#########################################################
In [26]:
if __name__ == '__main__':
    # Linear-kernel SVM trained on the first 1/100th of the training set.
    features_train, features_test, labels_train, labels_test = preprocess()
    clt = SVC(kernel="linear")

    t0 = time()
    # Floor division keeps the slice bound an integer on Python 3 as well;
    # plain `/` only yields an int on Python 2.
    features_train = features_train[:len(features_train) // 100]
    labels_train = labels_train[:len(labels_train) // 100]
    clt.fit(features_train, labels_train)
    print("Training time:  {0} seconds.".format(round(time() - t0, 3)))

    t0 = time()
    pred = clt.predict(features_test)
    print("Prediction Time:  {0} seconds.".format(round(time() - t0, 3)))

    # accuracy_score is declared as (y_true, y_pred).
    acc = accuracy_score(labels_test, pred)
    print("Accuracy:  {0}".format(acc))
In [27]:
if __name__ == '__main__':
    # RBF-kernel SVM (default C) trained on the first 1/100th of the training set.
    features_train, features_test, labels_train, labels_test = preprocess()
    clt = SVC(kernel="rbf")

    t0 = time()
    # Floor division keeps the slice bound an integer on Python 3 as well;
    # plain `/` only yields an int on Python 2.
    features_train = features_train[:len(features_train) // 100]
    labels_train = labels_train[:len(labels_train) // 100]
    clt.fit(features_train, labels_train)
    print("Training time:  {0} seconds.".format(round(time() - t0, 3)))

    t0 = time()
    pred = clt.predict(features_test)
    print("Prediction Time:  {0} seconds.".format(round(time() - t0, 3)))

    # accuracy_score is declared as (y_true, y_pred).
    acc = accuracy_score(labels_test, pred)
    print("Accuracy:  {0}".format(acc))
In [31]:
if __name__ == '__main__':
    # RBF-kernel SVM with C=10000.0 trained on the first 1/100th of the training set.
    features_train, features_test, labels_train, labels_test = preprocess()
    clt = SVC(kernel="rbf", C=10000.0)

    t0 = time()
    # Floor division keeps the slice bound an integer on Python 3 as well;
    # plain `/` only yields an int on Python 2.
    features_train = features_train[:len(features_train) // 100]
    labels_train = labels_train[:len(labels_train) // 100]
    clt.fit(features_train, labels_train)
    print("Training time:  {0} seconds.".format(round(time() - t0, 3)))

    t0 = time()
    pred = clt.predict(features_test)
    print("Prediction Time:  {0} seconds.".format(round(time() - t0, 3)))

    # accuracy_score is declared as (y_true, y_pred).
    acc = accuracy_score(labels_test, pred)
    print("Accuracy:  {0}".format(acc))
In [32]:
if __name__ == '__main__':
    # RBF-kernel SVM with C=10000.0 trained on the FULL training set: unlike
    # the preceding cells, no 1/100th subsampling is applied here.
    features_train, features_test, labels_train, labels_test = preprocess()
    clt = SVC(kernel="rbf", C=10000.0)

    t0 = time()
    clt.fit(features_train, labels_train)
    print("Training time:  {0} seconds.".format(round(time() - t0, 3)))

    t0 = time()
    pred = clt.predict(features_test)
    print("Prediction Time:  {0} seconds.".format(round(time() - t0, 3)))

    # accuracy_score is declared as (y_true, y_pred).
    acc = accuracy_score(labels_test, pred)
    print("Accuracy:  {0}".format(acc))
In [34]:
# Predicted labels for test items 10, 26 and 50, one per line.
# Single-argument print(...) produces the same text on Python 2 and 3.
print(pred[10])
print(pred[26])
print(pred[50])
In [38]:
# Count how many predictions fall on each label value (index i of the result
# is the number of times label i was predicted).
# NOTE(review): presumably `pred` is the output of the most recently run SVM
# cell, where label 0 is Sara and label 1 is Chris — confirm which run.
np.bincount(pred)
Out[38]:
In [ ]: