In [ ]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time
Feel free to adjust and experiment with these parameters after you have completed the lab:
In [ ]:
C = 1  # SVM regularization strength; passed as C=C when the SVC is created below
kernel = 'linear'  # SVC kernel to experiment with ('linear', 'rbf', 'poly', ...)
In [ ]:
# Number of fit/score repetitions used by benchmark() below.
# TODO: Change to 200000 once you get to Question#2
iterations = 5000
# You can set this to false if you want to draw the full square matrix:
FAST_DRAW = True
In [ ]:
def drawPlots(model, X_train, X_test, y_train, y_test, wintitle='Figure 1'):
    """Draw a matrix of 2D decision-boundary cross sections of `model`.

    For every pair of features in X_train, re-trains `model` on just those
    two columns, plots the resulting decision regions plus the training
    points, and annotates each subplot with the test-set score. Prints the
    best 2D score found.

    Parameters:
        model   -- an sklearn-style classifier (fit / predict / score).
        X_train, X_test -- pandas DataFrames of features.
        y_train, y_test -- labels; assumed integer-encoded 0..2 (TODO confirm
                           against the canadian:0, kama:1, rosa:2 mapping).
        wintitle -- window title for the figure.
    """
    # You can use this to break any higher-dimensional space down,
    # And view cross sections of it.

    # If this line throws an error, use plt.style.use('ggplot') instead
    mpl.style.use('ggplot')  # Look Pretty

    padding = 3
    resolution = 0.5
    max_2d_score = 0

    y_colors = ['#ff0000', '#00ff00', '#0000ff']
    my_cmap = mpl.colors.ListedColormap(['#ffaaaa', '#aaffaa', '#aaaaff'])
    colors = [y_colors[i] for i in y_train]
    num_columns = len(X_train.columns)

    fig = plt.figure()
    # fig.canvas.set_window_title was removed in matplotlib 3.6;
    # go through the canvas manager instead.
    fig.canvas.manager.set_window_title(wintitle)
    fig.set_tight_layout(True)

    cnt = 0
    for col in range(num_columns):
        for row in range(num_columns):
            # Easy out: with FAST_DRAW, skip the redundant upper triangle
            if FAST_DRAW and col > row:
                cnt += 1
                continue

            ax = plt.subplot(num_columns, num_columns, cnt + 1)
            plt.xticks(())
            plt.yticks(())

            # Intersection (diagonal): just print the feature name
            if col == row:
                plt.text(0.5, 0.5, X_train.columns[row], verticalalignment='center', horizontalalignment='center', fontsize=12)
                cnt += 1
                continue

            # Only select two features to display, then train the model.
            # NOTE: DataFrame.ix was removed in pandas 1.0 -- use the
            # positional .iloc indexer instead.
            X_train_bag = X_train.iloc[:, [row, col]]
            X_test_bag = X_test.iloc[:, [row, col]]
            model.fit(X_train_bag, y_train)

            # Create a mesh to plot in
            x_min, x_max = X_train_bag.iloc[:, 0].min() - padding, X_train_bag.iloc[:, 0].max() + padding
            y_min, y_max = X_train_bag.iloc[:, 1].min() - padding, X_train_bag.iloc[:, 1].max() + padding
            xx, yy = np.meshgrid(np.arange(x_min, x_max, resolution),
                                 np.arange(y_min, y_max, resolution))

            # Plot Boundaries
            plt.xlim(xx.min(), xx.max())
            plt.ylim(yy.min(), yy.max())

            # Prepare the contour
            Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
            Z = Z.reshape(xx.shape)
            plt.contourf(xx, yy, Z, cmap=my_cmap, alpha=0.8)
            plt.scatter(X_train_bag.iloc[:, 0], X_train_bag.iloc[:, 1], c=colors, alpha=0.5)

            score = round(model.score(X_test_bag, y_test) * 100, 3)
            plt.text(0.5, 0, "Score: {0}".format(score), transform=ax.transAxes, horizontalalignment='center', fontsize=8)
            max_2d_score = score if score > max_2d_score else max_2d_score

            cnt += 1

    print("Max 2D Score: ", max_2d_score)
In [ ]:
def benchmark(model, X_train, X_test, y_train, y_test, wintitle='Figure 1', n_iter=None):
    """Time repeated training and scoring of `model` and print the results.

    Fills in the lab's TODO loops: as written the original loop bodies were
    empty (a SyntaxError) and `score` was never assigned (a NameError).

    Parameters:
        model   -- an sklearn-style classifier (fit / score).
        X_train, X_test, y_train, y_test -- data / labels.
        wintitle -- label printed above the results.
        n_iter  -- optional override for the repetition count; when None,
                   falls back to the notebook-level `iterations` constant.

    Returns the classifier's score on the testing set.
    """
    print(wintitle + ' Results')

    # Only touch the module-level global when no explicit count was given.
    n = iterations if n_iter is None else n_iter

    s = time.time()
    for _ in range(n):
        # Train the classifier on the training data / labels:
        model.fit(X_train, y_train)
    print("{0} Iterations Training Time: ".format(n), time.time() - s)

    s = time.time()
    for _ in range(n):
        # Score the classifier on the testing data / labels:
        score = model.score(X_test, y_test)
    print("{0} Iterations Scoring Time: ".format(n), time.time() - s)

    print("High-Dimensionality Score: ", round((score * 100), 3))
    return score
Load up the wheat dataset into dataframe X and verify you did it properly. Indices shouldn't be doubled, nor should you have any headers with weird characters...
In [ ]:
# .. your code here ..
In [ ]:
# Show every row that contains at least one NaN:
X[X.isnull().any(axis=1)]
Go ahead and drop any row with a nan:
In [ ]:
# .. your code here ..
In the future, you might try setting the nan values to the mean value of that column. Now that you have the labels, the mean should be calculated per class rather than across all classes.
Copy the labels out of the dataframe into variable y, then remove them from X.
Encode the labels, using the .map() trick we showed you in Module 5, such that canadian:0, kama:1, and rosa:2.
In [ ]:
# .. your code here ..
Split your data into a test and train set. Your test size should be 30% with random_state 7. Please use variable names: X_train, X_test, y_train, and y_test:
In [ ]:
# .. your code here ..
Create an SVC classifier named svc and use a linear kernel. You already have C defined at the top of the lab, so just set C=C.
In [ ]:
# .. your code here ..
Create a KNeighbors classifier named knn and set the neighbor count to 5:
In [ ]:
# .. your code here ..
In [ ]:
# Time the KNeighbors classifier, then draw its pairwise decision boundaries
benchmark(knn, X_train, X_test, y_train, y_test, 'KNeighbors')
drawPlots(knn, X_train, X_test, y_train, y_test, 'KNeighbors')
In [ ]:
# Time the SVC classifier, then draw its pairwise decision boundaries
benchmark(svc, X_train, X_test, y_train, y_test, 'SVC')
drawPlots(svc, X_train, X_test, y_train, y_test, 'SVC')
In [ ]:
plt.show()
After submitting your answers, mess around with the gamma, kernel, and C values.
In [ ]: