In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
matplotlib.style.use('ggplot') # Look Pretty
def plotDecisionBoundary(model, X, y):
    fig = plt.figure()
    ax = fig.add_subplot(111)

    padding = 0.6
    resolution = 0.0025
    colors = ['royalblue', 'forestgreen', 'ghostwhite']

    # Calculate the boundaries
    x_min, x_max = X[:, 0].min(), X[:, 0].max()
    y_min, y_max = X[:, 1].min(), X[:, 1].max()
    x_range = x_max - x_min
    y_range = y_max - y_min
    x_min -= x_range * padding
    y_min -= y_range * padding
    x_max += x_range * padding
    y_max += y_range * padding

    # Create a 2D Grid Matrix. The values stored in the matrix
    # are the predictions of the class at said location
    xx, yy = np.meshgrid(np.arange(x_min, x_max, resolution),
                         np.arange(y_min, y_max, resolution))

    # What class does the classifier say?
    Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)

    # Plot the contour map
    cs = plt.contourf(xx, yy, Z, cmap=plt.cm.terrain)

    # Plot the original points as well...
    for label in range(len(np.unique(y))):
        indices = np.where(y == label)
        plt.scatter(X[indices, 0], X[indices, 1], c=colors[label], label=str(label), alpha=0.8)

    p = model.get_params()
    plt.axis('tight')
    plt.title('K = ' + str(p['n_neighbors']))
In [2]:
# TODO: Load up the dataset into a variable called X. Check the .head and
# compare it to the file you loaded in a text editor. Make sure you're
# loading your data properly--don't fail on the 1st step!
#
# TODO: Basic nan munging. Fill each row's nans with the mean of the feature
df = pd.read_csv('Datasets/wheat.data')
df = df.fillna(df.mean(numeric_only=True))  # fill nans with each numeric feature's mean
# TODO: Copy the 'wheat_type' series slice out of X, and into a series
# called 'y'. Then drop the original 'wheat_type' column from the X
X = df
y = X.wheat_type
del X['wheat_type']
# TODO: Do a quick, "ordinal" conversion of 'y'. In actuality our
# classification isn't ordinal, but just as an experiment...
# This is covered in Chapter 2: Feature Representation
ordered_type = ['canadian', 'rosa', 'kama']
y = y.astype(pd.CategoricalDtype(categories=ordered_type, ordered=True)).cat.codes
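In [ ]:
# Optional sanity check of the ordinal encoding above: with the category
# order ['canadian', 'rosa', 'kama'], .cat.codes maps canadian -> 0,
# rosa -> 1, kama -> 2 (anything unexpected would become -1).
print(dict(enumerate(ordered_type)))    # code -> original label
print(y.value_counts().sort_index())    # samples per encoded class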
In [3]:
# TODO: Split X into training and testing data sets using train_test_split().
# INFO: Use 0.33 test size, and use random_state=1. This is important
# so that your answers are verifiable. In the real world, you wouldn't
# specify a random_state.
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1)
In [4]:
list(set(y_train))
Out[4]:
In [5]:
# TODO: Create an instance of SKLearn's Normalizer class and then train it
# using its .fit() method against your *training* data.
#
# NOTE: The reason you only fit against your training data is because in a
# real-world situation, you'll only have your training data to train with!
# In this lab setting, you have both train+test data; but in the wild,
# you'll only have your training data, and then unlabeled data you want to
# apply your models to.
#
from sklearn.preprocessing import Normalizer
norm = Normalizer()
norm.fit(x_train)
Out[5]:
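In [ ]:
# Optional check: Normalizer rescales each *row* to unit length (it does not
# standardize columns the way StandardScaler does), so every transformed
# sample should have an L2 norm of about 1.0. A minimal sketch reusing the
# fitted 'norm' object from above.
row_norms = np.linalg.norm(norm.transform(x_train), axis=1)
print(row_norms.min(), row_norms.max())   # both should be ~1.0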
In [6]:
# TODO: With your trained pre-processor, transform both your training AND
# testing data.
#
# NOTE: Any testing data has to be transformed with the preprocessor
# that has been fit against your training data, so that it exists in the same
# feature-space as the original data used to train your models.
x_transform_train = norm.transform(x_train)
x_transform_test = norm.transform(x_test)
In [7]:
# TODO: Just like your preprocessing transformation, create a PCA
# transformation as well. Fit it against your training data, and then
# project your training and testing features into PCA space using the
# PCA model's .transform() method.
#
# NOTE: This has to be done because the only way to visualize the decision
# boundary in 2D would be if your KNN algo ran in 2D as well:
#
from sklearn.decomposition import PCA
pca = PCA(n_components=2, svd_solver='full')
pca.fit(x_transform_train)                  # fit against the normalized training data
T_train = pca.transform(x_transform_train)
T_test = pca.transform(x_transform_test)
print(T_train.shape, T_test.shape)
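In [ ]:
# Optional check: how much of the variance in the normalized training data
# the two principal components retain. A small sketch using the 'pca' object
# fit above; the exact numbers depend on the data.
print(pca.explained_variance_ratio_, pca.explained_variance_ratio_.sum())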
In [8]:
# TODO: Create and train a KNeighborsClassifier. Start with K=9 neighbors.
# NOTE: Be sure to train your classifier against the pre-processed, PCA-
# transformed training data above! You do not, of course, need to transform
# your labels.
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=9)
knn.fit(T_train, y_train)
Out[8]:
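In [ ]:
# Optional experiment: K=9 is only a starting point, so a quick sweep over a
# few neighbor counts shows how sensitive the test accuracy is to K. A minimal
# sketch reusing T_train/T_test and y_train/y_test from above.
for k in [1, 3, 5, 7, 9, 11, 15]:
    clf = KNeighborsClassifier(n_neighbors=k)
    clf.fit(T_train, y_train)
    print('K =', k, 'test accuracy =', round(clf.score(T_test, y_test), 3))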
In [ ]:
# HINT: Ensure your KNeighbors classifier object from earlier is called 'knn'
plotDecisionBoundary(knn, T_train, y_train)
#------------------------------------
#
# TODO: Display the accuracy score of your test data/labels, computed by
# your KNeighbors model.
#
# NOTE: You do NOT have to run .predict before calling .score, since
# .score will take care of running your predictions for you automatically
# (the equivalent manual predict + accuracy_score check is sketched below).
#
knn.score(T_test, y_test)
#
# BONUS: Instead of the ordinal conversion, try and get this assignment
# working with a proper Pandas get_dummies for feature encoding. HINT:
# You might have to update some of the plotDecisionBoundary code.
# (A hedged sketch of the encoding step is included in a cell below.)
plt.show()
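In [ ]:
# A minimal sketch showing that .score above is equivalent to running .predict
# yourself and measuring accuracy with sklearn's accuracy_score. Uses only
# objects already defined in this notebook.
from sklearn.metrics import accuracy_score
predictions = knn.predict(T_test)
print(accuracy_score(y_test, predictions))   # should match knn.score(T_test, y_test)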
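In [ ]:
# Hedged sketch of the BONUS idea: encode the labels with pd.get_dummies
# instead of the ordinal .cat.codes conversion. get_dummies produces one
# indicator column per wheat type, so y becomes a 3-column frame rather than
# a single integer series; KNeighborsClassifier can be fit on such a target,
# but plotDecisionBoundary assumes one integer label per sample and would
# need updating, as the hint above notes. The 'labels'/'y_dummies' names are
# just for illustration.
labels = pd.read_csv('Datasets/wheat.data')['wheat_type']   # reload the original string labels
y_dummies = pd.get_dummies(labels)
print(y_dummies.head())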
In [ ]: