In [1]:
from sklearn.naive_bayes import GaussianNB
import numpy as np
%matplotlib inline
import os
import pprint
In [3]:
%load scripts/class_vis.py
In [2]:
#!/usr/bin/python
#from udacityplots import *
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import pylab as pl
import numpy as np
#import numpy as np
#import matplotlib.pyplot as plt
#plt.ioff()
def prettyPicture(clf, X_test, y_test):
x_min = 0.0; x_max = 1.0
y_min = 0.0; y_max = 1.0
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, m_max]x[y_min, y_max].
h = .01 # step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.pcolormesh(xx, yy, Z, cmap=pl.cm.seismic)
# Plot also the test points
grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==0]
bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==0]
grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==1]
bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==1]
plt.scatter(grade_sig, bumpy_sig, color = "b", label="fast")
plt.scatter(grade_bkg, bumpy_bkg, color = "r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.show()
#plt.savefig("test.png")
import base64
import json
import subprocess
def output_image(name, format, bytes):
image_start = "BEGIN_IMAGE_f9825uweof8jw9fj4r8"
image_end = "END_IMAGE_0238jfw08fjsiufhw8frs"
data = {}
data['name'] = name
data['format'] = format
data['bytes'] = base64.encodestring(bytes)
print image_start+json.dumps(data)+image_end
In [6]:
%load scripts/prep_terrain_data.py
In [3]:
#!/usr/bin/python
import random
def makeTerrainData(n_points=1000):
###############################################################################
### make the toy dataset
random.seed(42)
grade = [random.random() for ii in range(0,n_points)]
bumpy = [random.random() for ii in range(0,n_points)]
error = [random.random() for ii in range(0,n_points)]
y = [round(grade[ii]*bumpy[ii]+0.3+0.1*error[ii]) for ii in range(0,n_points)]
for ii in range(0, len(y)):
if grade[ii]>0.8 or bumpy[ii]>0.8:
y[ii] = 1.0
### split into train/test sets
X = [[gg, ss] for gg, ss in zip(grade, bumpy)]
split = int(0.75*n_points)
X_train = X[0:split]
X_test = X[split:]
y_train = y[0:split]
y_test = y[split:]
grade_sig = [X_train[ii][0] for ii in range(0, len(X_train)) if y_train[ii]==0]
bumpy_sig = [X_train[ii][1] for ii in range(0, len(X_train)) if y_train[ii]==0]
grade_bkg = [X_train[ii][0] for ii in range(0, len(X_train)) if y_train[ii]==1]
bumpy_bkg = [X_train[ii][1] for ii in range(0, len(X_train)) if y_train[ii]==1]
# training_data = {"fast":{"grade":grade_sig, "bumpiness":bumpy_sig}
# , "slow":{"grade":grade_bkg, "bumpiness":bumpy_bkg}}
grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==0]
bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==0]
grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==1]
bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==1]
test_data = {"fast":{"grade":grade_sig, "bumpiness":bumpy_sig}
, "slow":{"grade":grade_bkg, "bumpiness":bumpy_bkg}}
return X_train, y_train, X_test, y_test
# return training_data, test_data
In [14]:
%load scripts/ClassifyNB.py
In [4]:
def classify(features_train, labels_train):
### import the sklearn module for GaussianNB
### create classifier
### fit the classifier on the training features and labels
### return the fit classifier
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(features_train, labels_train)
return clf
In [7]:
#!/usr/bin/python
""" Complete the code in ClassifyNB.py with the sklearn
Naive Bayes classifier to classify the terrain data.
The objective of this exercise is to recreate the decision
boundary found in the lesson video, and make a plot that
visually shows the decision boundary """
# from prep_terrain_data import makeTerrainData
# from class_vis import prettyPicture, output_image
# from ClassifyNB import classify
import numpy as np
import pylab as pl
features_train, labels_train, features_test, labels_test = makeTerrainData()
### the training data (features_train, labels_train) have both "fast" and "slow" points mixed
### in together--separate them so we can give them different colors in the scatterplot,
### and visually identify them
grade_fast = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==0]
bumpy_fast = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==0]
grade_slow = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==1]
bumpy_slow = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==1]
clf = classify(features_train, labels_train)
### draw the decision boundary with the text points overlaid
prettyPicture(clf, features_test, labels_test)
# output_image("test.png", "png", open("test.png", "rb").read())
In [5]:
def NBAccuracy(features_train, labels_train, features_test, labels_test):
""" compute the accuracy of your Naive Bayes classifier """
### import the sklearn module for GaussianNB
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
### create classifier
clf = GaussianNB()
### fit the classifier on the training features and labels
clf.fit(features_train, labels_train)
### use the trained classifier to predict labels for the test features
pred = clf.predict(features_test)
### calculate and return the accuracy on the test data
### this is slightly different than the example,
### where we just print the accuracy
### you might need to import an sklearn module
accuracy = accuracy_score(labels_test, pred)
return accuracy
In [6]:
features_train, labels_train, features_test, labels_test = makeTerrainData()
def submitAccuracy():
accuracy = NBAccuracy(features_train, labels_train, features_test, labels_test)
return accuracy
In [7]:
print NBAccuracy(features_train, labels_train, features_test, labels_test)
In [41]:
# Load nb_author_id.py for the quiz.
%load ../ud120-projects/naive_bayes/nb_author_id.py
In [8]:
# Write the content of this cell as a python script.
# Using the same name, overwrite nb_author_id.py
# %%writefile ../ud120-projects/naive_bayes/nb_author_id.py
#!/usr/bin/python
"""
this is the code to accompany the Lesson 1 (Naive Bayes) mini-project
use a Naive Bayes Classifier to identify emails by their authors
authors and labels:
Sara has label 0
Chris has label 1
"""
import sys
from time import time
sys.path.append("../ud120-projects/tools/")
from email_preprocess import preprocess
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score
### features_train and features_test are the features for the training
### and testing datasets, respectively
### labels_train and labels_test are the corresponding item labels
features_train, features_test, labels_train, labels_test = preprocess()
#########################################################
NB = GaussianNB()
t0 = time()
NB.fit(features_train, labels_train)
print "Training Time: ", round(time()-t0, 3), "seconds."
t0 = time()
pred = NB.predict(features_test)
print "Prediction Time: ", round(time()-t0, 3), "seconds."
accuracy = accuracy_score(pred, labels_test)
print "Accuracy: ", accuracy
#########################################################
In [ ]:
DATADIR = "Temp2/Intro to Machine Learning/ud120-projects"
DATAFILE = "README.md"
datafile = os.path.join(os.path.expanduser("~"), DATADIR ,DATAFILE )
In [9]:
os.listdir('.')
Out[9]:
In [ ]: