In [1]:
import numpy as np
X = np.array([[-1,-1], [-2,-1], [-3,-2], [1,1], [2,1], [3,2]])
print "X =\n",X,"\n"
Y = np.array([1,1,1,2,2,2])
print "Y =",Y,"\n"
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
clf.fit(X,Y)
print "Predict class for [-0.8,-1] =>",(clf.predict([[-0.8,-1]]))
In [2]:
#!/usr/bin/python
""" Complete the code in ClassifyNB.py with the sklearn
Naive Bayes classifier to classify the terrain data.
The objective of this exercise is to recreate the decision
boundary found in the lesson video, and make a plot that
visually shows the decision boundary """
from prep_terrain_data import makeTerrainData
from class_vis import prettyPicture, output_image
from ClassifyNB import classify
import numpy as np
import pylab as pl
features_train, labels_train, features_test, labels_test = makeTerrainData()
### the training data (features_train, labels_train) have both "fast" and "slow" points mixed
### in together--separate them so we can give them different colors in the scatterplot,
### and visually identify them
grade_fast = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==0]
bumpy_fast = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==0]
grade_slow = [features_train[ii][0] for ii in range(0, len(features_train)) if labels_train[ii]==1]
bumpy_slow = [features_train[ii][1] for ii in range(0, len(features_train)) if labels_train[ii]==1]
clf = classify(features_train, labels_train)
### draw the decision boundary with the text points overlaid
prettyPicture(clf, features_test, labels_test)
output_image("test.png", "png", open("test.png", "rb").read())
In [ ]:
#!/usr/bin/python
#from udacityplots import *
import matplotlib
matplotlib.use('agg')
import matplotlib.pyplot as plt
import pylab as pl
import numpy as np
#import numpy as np
#import matplotlib.pyplot as plt
#plt.ioff()
def prettyPicture(clf, X_test, y_test):
x_min = 0.0; x_max = 1.0
y_min = 0.0; y_max = 1.0
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, m_max]x[y_min, y_max].
h = .01 # step size in the mesh
xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h))
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.xlim(xx.min(), xx.max())
plt.ylim(yy.min(), yy.max())
plt.pcolormesh(xx, yy, Z, cmap=pl.cm.seismic)
# Plot also the test points
grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==0]
bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==0]
grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==1]
bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==1]
plt.scatter(grade_sig, bumpy_sig, color = "b", label="fast")
plt.scatter(grade_bkg, bumpy_bkg, color = "r", label="slow")
plt.legend()
plt.xlabel("bumpiness")
plt.ylabel("grade")
plt.savefig("test.png")
import base64
import json
import subprocess
def output_image(name, format, bytes):
image_start = "BEGIN_IMAGE_f9825uweof8jw9fj4r8"
image_end = "END_IMAGE_0238jfw08fjsiufhw8frs"
data = {}
data['name'] = name
data['format'] = format
data['bytes'] = base64.encodestring(bytes)
print image_start+json.dumps(data)+image_end
In [ ]:
#!/usr/bin/python
import random
def makeTerrainData(n_points=1000):
###############################################################################
### make the toy dataset
random.seed(42)
grade = [random.random() for ii in range(0,n_points)]
bumpy = [random.random() for ii in range(0,n_points)]
error = [random.random() for ii in range(0,n_points)]
y = [round(grade[ii]*bumpy[ii]+0.3+0.1*error[ii]) for ii in range(0,n_points)]
for ii in range(0, len(y)):
if grade[ii]>0.8 or bumpy[ii]>0.8:
y[ii] = 1.0
### split into train/test sets
X = [[gg, ss] for gg, ss in zip(grade, bumpy)]
split = int(0.75*n_points)
X_train = X[0:split]
X_test = X[split:]
y_train = y[0:split]
y_test = y[split:]
grade_sig = [X_train[ii][0] for ii in range(0, len(X_train)) if y_train[ii]==0]
bumpy_sig = [X_train[ii][1] for ii in range(0, len(X_train)) if y_train[ii]==0]
grade_bkg = [X_train[ii][0] for ii in range(0, len(X_train)) if y_train[ii]==1]
bumpy_bkg = [X_train[ii][1] for ii in range(0, len(X_train)) if y_train[ii]==1]
# training_data = {"fast":{"grade":grade_sig, "bumpiness":bumpy_sig}
# , "slow":{"grade":grade_bkg, "bumpiness":bumpy_bkg}}
grade_sig = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==0]
bumpy_sig = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==0]
grade_bkg = [X_test[ii][0] for ii in range(0, len(X_test)) if y_test[ii]==1]
bumpy_bkg = [X_test[ii][1] for ii in range(0, len(X_test)) if y_test[ii]==1]
test_data = {"fast":{"grade":grade_sig, "bumpiness":bumpy_sig}
, "slow":{"grade":grade_bkg, "bumpiness":bumpy_bkg}}
return X_train, y_train, X_test, y_test
# return training_data, test_data
In [ ]:
def classify(features_train, labels_train):
### import the sklearn module for GaussianNB
### create classifier
### fit the classifier on the training features and labels
### return the fit classifier
from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
return clf.fit(features_train,labels_train)
In [ ]:
def NBAccuracy(features_train, labels_train, features_test, labels_test):
""" compute the accuracy of your Naive Bayes classifier """
### import the sklearn module for GaussianNB
from sklearn.naive_bayes import GaussianNB
### create classifier
clf = GaussianNB()
### fit the classifier on the training features and labels
clf.fit(features_train,labels_train)
### use the trained classifier to predict labels for the test features
pred = clf.predict(features_test)
### calculate and return the accuracy on the test data
### this is slightly different than the example,
### where we just print the accuracy
### you might need to import an sklearn module
accuracy = clf.score(features_test,labels_test)
return accuracy
In [ ]:
from class_vis import prettyPicture
from prep_terrain_data import makeTerrainData
from classify import NBAccuracy
import matplotlib.pyplot as plt
import numpy as np
import pylab as pl
features_train, labels_train, features_test, labels_test = makeTerrainData()
def submitAccuracy():
accuracy = NBAccuracy(features_train, labels_train, features_test, labels_test)
return accuracy
In [20]:
prob_c_times_prob_pos_given_c = round(0.01 * 0.9, 3)
prob_no_c_times_prob_pos_given_no_c = round(0.99 * 0.1, 3)
print prob_c_times_prob_pos_given_c, prob_no_c_times_prob_pos_given_no_c
prob_pos = (prob_c_times_prob_pos_given_c + prob_no_c_times_prob_pos_given_no_c)
print round(prob_pos, 3)
print round(prob_c_times_prob_pos_given_c/prob_pos, 3)
algorithm for Bayes Rule
$P(C)$ prior
$P(Pos|C)$ sensitivity
$P(Neg|\neg{C})$ specitivity
$P(C) \cdot P(Pos|C) = P(Pos, C)$
$P(C) \cdot P(Pos|\neg{C}) = P(Pos, \neg{C})$
$P(Pos, C) + P(Pos, \neg{C}) = P(Pos)$
$\frac{P(Pos, C)}{P(Pos)} = P(C|Pos)$
$\frac{P(Pos, \neg{C})}{P(Pos)} = P(\neg{C}|Pos)$
In [ ]: