In [4]:
from sklearn import svm
%matplotlib inline

In [2]:
X = [[0, 0], [1, 2]]
y = [0, 1]

clf = svm.SVC()

clf.fit(X, y)

clf.predict([[2., 2.]])


Out[2]:
array([1])
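As a quick check on the fitted model (an aside, not part of the lesson), the trained SVC exposes its support vectors through standard scikit-learn attributes:

# Inspect the classifier fitted in the cell above
print(clf.support_vectors_)   # the training points that define the margin
print(clf.support_)           # indices of those points within X
print(clf.n_support_)         # number of support vectors per class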

In [3]:
%load scripts/class_vis.py

In [5]:
#!/usr/bin/python

#from udacityplots import *
import matplotlib 
matplotlib.use('agg')

import matplotlib.pyplot as plt
import pylab as pl
import numpy as np

#import numpy as np
#import matplotlib.pyplot as plt
#plt.ioff()

def prettyPicture(clf, X_test, y_test):
    x_min = 0.0; x_max = 1.0
    y_min = 0.0; y_max = 1.0

    # Plot the decision boundary. For that, we will assign a color to each
    # point in the mesh [x_min, x_max]x[y_min, y_max].
    h = .01  # step size in the mesh
    xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
    Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])

    # Put the result into a color plot
    Z = Z.reshape(xx.shape)
    plt.xlim(xx.min(), xx.max())
    plt.ylim(yy.min(), yy.max())

    plt.pcolormesh(xx, yy, Z, cmap=pl.cm.seismic)

    # Plot also the test points
    grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==0]
    bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==0]
    grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==1]
    bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==1]

    plt.scatter(grade_sig, bumpy_sig, color = "b", label="fast")
    plt.scatter(grade_bkg, bumpy_bkg, color = "r", label="slow")
    plt.legend()
    plt.xlabel("bumpiness")
    plt.ylabel("grade")
    plt.show()
    #plt.savefig("test.png")
    
import base64
import json
import subprocess

def output_image(name, format, bytes):
    image_start = "BEGIN_IMAGE_f9825uweof8jw9fj4r8"
    image_end = "END_IMAGE_0238jfw08fjsiufhw8frs"
    data = {}
    data['name'] = name
    data['format'] = format
    data['bytes'] = base64.encodestring(bytes)
    print image_start+json.dumps(data)+image_end


c:\Users\fch80_000\Anaconda\lib\site-packages\matplotlib\__init__.py:1256: UserWarning:  This call to matplotlib.use() has no effect
because the backend has already been chosen;
matplotlib.use() must be called *before* pylab, matplotlib.pyplot,
or matplotlib.backends is imported for the first time.

  warnings.warn(_use_error_msg)

In [6]:
%load scripts/prep_terrain_data.py

In [7]:
#!/usr/bin/python
import random


def makeTerrainData(n_points=1000):
###############################################################################
### make the toy dataset
    random.seed(42)
    grade = [random.random() for ii in range(0,n_points)]
    bumpy = [random.random() for ii in range(0,n_points)]
    error = [random.random() for ii in range(0,n_points)]
    y = [round(grade[ii]*bumpy[ii]+0.3+0.1*error[ii]) for ii in range(0,n_points)]
    for ii in range(0, len(y)):
        if grade[ii]>0.8 or bumpy[ii]>0.8:
            y[ii] = 1.0

### split into train/test sets
    X = [[gg, ss] for gg, ss in zip(grade, bumpy)]
    split = int(0.75*n_points)
    X_train = X[0:split]
    X_test  = X[split:]
    y_train = y[0:split]
    y_test  = y[split:]

    grade_sig = [X_train[ii][0] for ii in range(0, len(X_train)) if y_train[ii]==0]
    bumpy_sig = [X_train[ii][1] for ii in range(0, len(X_train)) if y_train[ii]==0]
    grade_bkg = [X_train[ii][0] for ii in range(0, len(X_train)) if y_train[ii]==1]
    bumpy_bkg = [X_train[ii][1] for ii in range(0, len(X_train)) if y_train[ii]==1]

#    training_data = {"fast":{"grade":grade_sig, "bumpiness":bumpy_sig}
#            , "slow":{"grade":grade_bkg, "bumpiness":bumpy_bkg}}


    grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==0]
    bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==0]
    grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==1]
    bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==1]

    test_data = {"fast":{"grade":grade_sig, "bumpiness":bumpy_sig}
            , "slow":{"grade":grade_bkg, "bumpiness":bumpy_bkg}}

    return X_train, y_train, X_test, y_test
#    return training_data, test_data
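# Quick sanity check (not part of the original script): with the default
# n_points=1000 and the 0.75 split above, makeTerrainData() returns
# 750 training points and 250 test points, each a [grade, bumpiness] pair.
X_chk, y_chk, X_chk_test, y_chk_test = makeTerrainData()
print("train size: %d, test size: %d" % (len(X_chk), len(X_chk_test)))   # 750, 250
print(X_chk[0])   # one [grade, bumpiness] pair, values in [0, 1)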

SVM


In [8]:
import sys
#from class_vis import prettyPicture
#from prep_terrain_data import makeTerrainData

import matplotlib.pyplot as plt
import copy
import numpy as np
import pylab as pl


features_train, labels_train, features_test, labels_test = makeTerrainData()


########################## SVM #################################
### we handle the import statement and SVC creation for you here
from sklearn.svm import SVC
clf = SVC(kernel="linear")


#### now your job is to fit the classifier
#### using the training features/labels, and to
#### make a set of predictions on the test data


clf.fit(features_train, labels_train)

#### store your predictions in a list named pred

pred = clf.predict(features_test)


from sklearn.metrics import accuracy_score
acc = accuracy_score(pred, labels_test)

def submitAccuracy():
    return acc

print "Accuracy: ", submitAccuracy()


Accuracy:  0.92
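For reference (not part of the exercise), accuracy_score here is just the fraction of test points whose predicted label matches the true one, so the 0.92 above can be reproduced by hand:

import numpy as np

# fraction of predictions that agree with the true labels
print(np.mean(np.asarray(pred) == np.asarray(labels_test)))   # 0.92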

In [9]:
prettyPicture(clf, features_test, labels_test)



In [10]:
clf2 = SVC(kernel="linear", gamma=1.0)
clf2.fit(features_train, labels_train)


Out[10]:
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3, gamma=1.0,
  kernel='linear', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [11]:
prettyPicture(clf2, features_test, labels_test)



In [12]:
clf2 = SVC(kernel="rbf", C=100000000)
clf2.fit(features_train, labels_train)
prettyPicture(clf2, features_test, labels_test)
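# Illustrative aside (not from the lesson): sweep a few C values with the RBF
# kernel on the same terrain data. Larger C penalizes misclassified training
# points more heavily, so the boundary grows more intricate as C increases.
# The specific C values below are arbitrary examples.
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

for C_val in [1.0, 100.0, 10000.0, 1e8]:
    clf_c = SVC(kernel="rbf", C=C_val)
    clf_c.fit(features_train, labels_train)
    acc_c = accuracy_score(labels_test, clf_c.predict(features_test))
    print("C = %g  ->  test accuracy = %.3f" % (C_val, acc_c))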



In [16]:
%load ../ud120-projects/svm/svm_author_id.py

In [ ]:
# %%writefile ../ud120-projects/svm/svm_author_id.py
#!/usr/bin/python

""" 
    This is the code to accompany the Lesson 2 (SVM) mini-project.

    Use an SVM to identify emails from the Enron corpus by their authors:
    
    Sara has label 0
    Chris has label 1

"""
    
import sys
from time import time
sys.path.append("../ud120-projects/tools/")
from email_preprocess import preprocess


### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()



#########################################################
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

if __name__ == '__main__':

    clt = SVC(kernel="linear")
    t0 = time()
    clt.fit(features_train, labels_train)
    print "Training time: ", round(time()-t0, 3), "seconds."
    
    t0 = time()
    pred = clt.predict(features_test)
    print "Prediction Time: ", round(time()-t0, 3), "seconds."
    
    acc = accuracy_score(pred, labels_test)
    print "Accuracy: ", acc
    
#########################################################

In [26]:
if __name__ == '__main__':
    
    features_train, features_test, labels_train, labels_test = preprocess()
    
    clt = SVC(kernel="linear")
    t0 = time()
    
    # use only 1% of the training set to cut training time (integer division in Python 2)
    features_train = features_train[:len(features_train)/100]
    labels_train = labels_train[:len(labels_train)/100]
    
    clt.fit(features_train, labels_train)
    print "Training time: ", round(time()-t0, 3), "seconds."
    
    t0 = time()
    pred = clt.predict(features_test)
    print "Prediction Time: ", round(time()-t0, 3), "seconds."
    
    acc = accuracy_score(pred, labels_test)
    print "Accuracy: ", acc


no. of Chris training emails: 7936
no. of Sara training emails: 7884
Training time:  0.123 seconds.
Prediction Time:  1.122 seconds.
Accuracy:  0.884527872582

In [27]:
if __name__ == '__main__':
    
    features_train, features_test, labels_train, labels_test = preprocess()
    
    clt = SVC(kernel="rbf")
    t0 = time()
    
    features_train = features_train[:len(features_train)/100]
    labels_train = labels_train[:len(labels_train)/100]
    
    clt.fit(features_train, labels_train)
    print "Training time: ", round(time()-t0, 3), "seconds."
    
    t0 = time()
    pred = clt.predict(features_test)
    print "Prediction Time: ", round(time()-t0, 3), "seconds."
    
    acc = accuracy_score(pred, labels_test)
    print "Accuracy: ", acc


no. of Chris training emails: 7936
no. of Sara training emails: 7884
Training time:  0.118 seconds.
Prediction Time:  1.241 seconds.
Accuracy:  0.616040955631

In [31]:
if __name__ == '__main__':
    
    features_train, features_test, labels_train, labels_test = preprocess()
    
    clt = SVC(kernel="rbf", C=10000.0)
    t0 = time()
    
    features_train = features_train[:len(features_train)/100]
    labels_train = labels_train[:len(labels_train)/100]
    
    clt.fit(features_train, labels_train)
    print "Training time: ", round(time()-t0, 3), "seconds."
    
    t0 = time()
    pred = clt.predict(features_test)
    print "Prediction Time: ", round(time()-t0, 3), "seconds."
    
    acc = accuracy_score(pred, labels_test)
    print "Accuracy: ", acc


no. of Chris training emails: 7936
no. of Sara training emails: 7884
Training time:  0.109 seconds.
Prediction Time:  0.964 seconds.
Accuracy:  0.892491467577
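As an aside (not part of the mini-project), the hand-tuning of C above can be automated with scikit-learn's GridSearchCV on the same 1% training slice; the parameter grid below is illustrative, and the import path depends on the scikit-learn version (sklearn.grid_search in older releases, sklearn.model_selection in newer ones).

# Illustrative grid search over C on the 1% training slice defined above
from sklearn.model_selection import GridSearchCV   # sklearn.grid_search in older versions
from sklearn.svm import SVC

param_grid = {"C": [10.0, 100.0, 1000.0, 10000.0]}
search = GridSearchCV(SVC(kernel="rbf"), param_grid)
search.fit(features_train, labels_train)
print(search.best_params_)   # best C found by cross-validation on the slice
print(search.best_score_)    # its mean cross-validated accuracy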

In [32]:
if __name__ == '__main__':
    
    features_train, features_test, labels_train, labels_test = preprocess()
    
    clt = SVC(kernel="rbf", C=10000.0)
    t0 = time()
    
    #features_train = features_train[:len(features_train)/100]
    #labels_train = labels_train[:len(labels_train)/100]
    
    clt.fit(features_train, labels_train)
    print "Training time: ", round(time()-t0, 3), "seconds."
    
    t0 = time()
    pred = clt.predict(features_test)
    print "Prediction Time: ", round(time()-t0, 3), "seconds."
    
    acc = accuracy_score(pred, labels_test)
    print "Accuracy: ", acc


no. of Chris training emails: 7936
no. of Sara training emails: 7884
Training time:  118.905 seconds.
Prediction Time:  12.023 seconds.
Accuracy:  0.990898748578

In [34]:
print pred[10]
print pred[26]
print pred[50]


1
0
1

In [38]:
np.bincount(pred)


Out[38]:
array([881, 877], dtype=int64)
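Since Chris has label 1, the same count of Chris predictions can be read off directly:

# summing the 0/1 predictions counts the emails predicted as Chris
print(int(sum(pred)))   # 877, matching the bincount above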
