In [1]:
import scipy
import numpy as np
import pandas as pd
import plotly.plotly as py
import visplots
from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from sklearn import preprocessing, metrics
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from scipy.stats.distributions import randint
init_notebook_mode()
print("libraries all imported, ready to go")
In [2]:
# Import the data and explore the first few rows
dataset_path = "./processed_data/spam_dataset.csv"
dataset = pd.read_csv(dataset_path, sep=",", )
dataset_name = "Spam"
dataset.head()
In [ ]:
cols = dataset.columns.tolist()
cols = cols[2:] + [cols[1]]
dataset = dataset[cols]
dataset.head()
In [ ]:
dataset = dataset.dropna()
In [ ]:
# Convert to numpy array and check the dimensionality
npArray = np.array(dataset)
print(npArray.shape)
In [ ]:
header = dataset.columns.values
header
In [ ]:
# Split to input matrix X and class vector y
X = npArray[:,:-1].astype(float)
y = npArray[:,-1]
# Print the dimensions of X and y
print ("X dimensions:", X.shape)
print ("y dimensions:", y.shape)
In [ ]:
# Print the y frequencies
yFreq = scipy.stats.itemfreq(y)
print(yFreq)
In [ ]:
# Copy out the class labels (useful for plotting)
class_labels = [yFreq[0][0], yFreq[1][0]]
print(class_labels)
In [ ]:
# Convert the categorical to numeric values, and print the y frequencies
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
yFreq = scipy.stats.itemfreq(y)
print(yFreq)
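In [ ]:
# Quick optional check: the LabelEncoder keeps the original class names in
# le.classes_, and each name is encoded as its index in that array
print(dict(zip(le.classes_, le.transform(le.classes_))))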
In [ ]:
# Display the y frequencies in a barplot with Plotly
# (1) Create the Data object
data = [
Bar(
x = [class_labels[0], class_labels[1]],
y = [yFreq[0][1], yFreq[1][1]],
marker = dict(color=['blue','red'])
)
]
# (2) Create a Layout object
layout = Layout(
xaxis = dict(title = dataset_name),
yaxis = dict(title = "Count"),
width = 500
)
# (3) Create a Figure object
fig = dict(data = data, layout = layout)
# (4) Plot
iplot(fig)
In [ ]:
dataset.describe()
In [ ]:
# Create a boxplot of the raw data
nrow, ncol = X.shape
data = [
Box(
y = X[:,i], # values to be used for box plot
name = header[i], # label (on hover and x-axis)
marker = dict(color = "purple"),
) for i in range(ncol)
]
layout = Layout(
xaxis = dict(title = "Feature"),
yaxis = dict(title = "Value"),
showlegend=False,
)
fig = dict(data = data, layout = layout)
iplot(fig)
In [ ]:
# Alternatively
data = [
Box(
y = X[:,i],
name = header[i],
boxpoints='all',
jitter=0.4,
whiskerwidth=0.2,
marker=dict(
size=2,
),
line=dict(width=1),
boxmean='sd'
) for i in range(X.shape[1])
]
layout = Layout(
xaxis = dict(title = "Feature", tickangle=40),
yaxis = dict(title = "Value"),
showlegend=False,
height=700,
margin=Margin(b=170, t=50),
)
fig = dict(data = data, layout = layout)
iplot(fig)
In [ ]:
# Create a boxplot of the scaled data
X_scaled = preprocessing.scale(X)
nrow, ncol = X.shape
data = [
Box(
y = X_scaled[:,i], # values to be used for box plot
name = header[i], # label (on hover and x-axis)
marker = dict(color = "purple"),
) for i in range(ncol)
]
layout = Layout(
xaxis = dict(title = "Feature"),
yaxis = dict(title = "Value"),
showlegend=False,
)
fig = dict(data = data, layout = layout)
iplot(fig)
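In [ ]:
# Sanity check (optional): after preprocessing.scale, every column should have
# (approximately) zero mean and unit standard deviation
print("Column means (first 3):", np.round(X_scaled.mean(axis=0)[:3], 2))
print("Column stds  (first 3):", np.round(X_scaled.std(axis=0)[:3], 2))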
In [ ]:
# Create a scatter plot of the first two features
f1 = 0
f2 = 3
data = [
Scatter(
x = X[:, f1],
y = X[:, f2],
mode = "markers"
)
]
layout = Layout(
xaxis = dict(title = header[f1]),
yaxis = dict(title = header[f2])
)
fig = dict(data = data, layout = layout)
iplot(fig)
In [ ]:
# Create an enhanced scatter plot of the first two features
f1 = 0
f2 = 3
# Class "1" represented with red x
trace1 = Scatter(
x = X[y == 1, f1],
y = X[y == 1, f2],
mode = 'markers',
name = 'Class "1"',
marker = dict(
color = 'red',
symbol = 'x'
)
)
# Class "0" represented with blue circles
trace2 = Scatter(
x = X[y == 0, f1],
y = X[y == 0, f2],
mode = 'markers',
name = 'Class "0"',
marker = dict(
color = 'blue',
symbol = 'circle'
)
)
layout = Layout(
xaxis = dict(title = header[f1], type='log'),
yaxis = dict(title = header[f2], type='log'),
height= 600,
)
fig = dict(data = [trace1, trace2], layout = layout)
iplot(fig)
In [ ]:
# Create a grid plot of scatterplots using a combination of features
from plotly import tools
fig = tools.make_subplots(rows=4, cols=4, shared_xaxes=True, shared_yaxes=True)
for row in range(0, 4):
    for col in range(0, 4):
        # red x, class "1"
        trace1 = Scatter(
            x = X[y == 1, col],
            y = X[y == 1, row],
            mode = 'markers',
            marker = dict(
                color = 'red',
                symbol = 'x',
                opacity = .5
            )
        )
        # blue circles, class "0"
        trace2 = Scatter(
            x = X[y == 0, col],
            y = X[y == 0, row],
            mode = 'markers',
            marker = dict(
                color = 'blue',
                symbol = 'circle',
                opacity = .5
            )
        )
        posX = row+1
        posY = col+1
        fig.append_trace(trace1, posX, posY)
        fig.append_trace(trace2, posX, posY)
        fig['layout']['xaxis'+str(posX)].update(title=header[row])
        fig['layout']['yaxis'+str(posY)].update(title=header[col])
fig['layout'].update(
showlegend=False,
height=900,
)
iplot(fig)
In [ ]:
# Create a 3D scatterplot using the first three features
f1 = 0
f2 = 1
f3 = 2
desc = dict(
classes = [1, 0],
colors = ["red", "blue"],
labels = ['Class "1"', 'Class "0"'],
symbols = ["x", "circle"]
)
data = [
Scatter3d(
x = X[y == desc["classes"][i], f1],
y = X[y == desc["classes"][i], f2],
z = X[y == desc["classes"][i], f3],
name = desc["labels"][i],
mode = "markers",
marker = dict(
size = 2.5,
symbol = desc["symbols"][i],
color = desc["colors"][i]
)
) for i in range(len(desc["labels"]))
]
layout = Layout(
scene=Scene(
xaxis=XAxis(title=header[f1], titlefont=dict(size=11)),
yaxis=YAxis(title=header[f2], titlefont=dict(size=11)),
zaxis=ZAxis(title=header[f3], titlefont=dict(size=11))
),
margin=Margin(l=80, r=80, b=0, t=0, pad=0, autoexpand=True),
height= 600,
)
fig = dict(data = data, layout = layout)
iplot(fig)
In [ ]:
# Calculate the correlation coefficient
correlationMatrix = np.corrcoef(X_scaled, rowvar=0)
correlationMatrix
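In [ ]:
# Optional follow-up sketch: find the pair of distinct features with the largest
# absolute correlation, which the heatmap below should also highlight
absCorr = np.abs(correlationMatrix - np.eye(correlationMatrix.shape[0]))
i, j = np.unravel_index(np.argmax(absCorr), absCorr.shape)
print(header[i], "<->", header[j], ":", round(correlationMatrix[i, j], 2))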
In [ ]:
# Create a heatmap of the correlation coefficients
data = [
Heatmap(
x = header[:-1],        # feature names on both
y = header[:-1],        # axes (the last header entry is the class column, not a feature)
z = correlationMatrix,  # correlation as color contours
colorscale='YlOrRd',    # light yellow-orange-red colormap
reversescale=True       # inverse colormap order
)
]
layout = Layout(
xaxis = dict(title = "Feature"),
yaxis = dict(title = "Feature"),
margin= Margin(l=250),
height = 700,
)
fig = dict(data = data, layout = layout)
iplot(fig)
In [ ]:
# Scale the dataset
X_scaled = preprocessing.scale(X)
In [ ]:
# Split into training and test sets
XTrain, XTest, yTrain, yTest = train_test_split(X_scaled, y, random_state=1)
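In [ ]:
# Note: scaling the full dataset before splitting lets the test set influence the
# scaling statistics. A stricter alternative (sketch only, not used in the rest of
# this notebook; the variable names are just for illustration) is to fit the scaler
# on the training split and apply it to both splits:
XTrainRaw, XTestRaw, yTrainRaw, yTestRaw = train_test_split(X, y, random_state=1)
scaler = preprocessing.StandardScaler().fit(XTrainRaw)
XTrainAlt = scaler.transform(XTrainRaw)
XTestAlt = scaler.transform(XTestRaw)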
In [ ]:
# Print the dimensionality of the individual splits
print ("XTrain dimensions: ", XTrain.shape)
print ("yTrain dimensions: ", yTrain.shape)
print ("XTest dimensions: ", XTest.shape)
print ("yTest dimensions: ", yTest.shape)
In [ ]:
# Calculate the frequency of classes in yTest
yFreq = scipy.stats.itemfreq(yTest)
print (yFreq)
In [ ]:
# Build a KNN classifier with 3 nearest neighbors
knn3 = KNeighborsClassifier(n_neighbors=3)
knn3.fit(XTrain, yTrain)
yPredK3 = knn3.predict(XTest)
print ("Overall Accuracy:", round(metrics.accuracy_score(yTest, yPredK3), 2))
In [ ]:
# Build a KNN classifier with 99 nearest neighbors
knn99 = KNeighborsClassifier(n_neighbors=99)
knn99.fit(XTrain, yTrain)
yPredK99 = knn99.predict(XTest)
print ("Overall Accuracy:", round(metrics.accuracy_score(yTest, yPredK99), 2))
In [ ]:
# Get the confusion matrix for your classifier using metrics.confusion_matrix
mat = metrics.confusion_matrix(yTest, yPredK3)
print (mat)
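In [ ]:
# A short sketch relating the confusion matrix to the summary metrics
# (in scikit-learn, rows are true classes and columns are predicted classes)
tn, fp, fn, tp = mat[0, 0], mat[0, 1], mat[1, 0], mat[1, 1]
print("Accuracy:           ", round((tp + tn) / float(tp + tn + fp + fn), 2))
print("Precision (class 1):", round(tp / float(tp + fp), 2))
print("Recall (class 1):   ", round(tp / float(tp + fn), 2))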
In [ ]:
# Report the metrics using metrics.classification_report
print (metrics.classification_report(yTest, yPredK3))
print ("accuracy: ", round(metrics.accuracy_score(yTest, yPredK3), 2))
In [ ]:
# Check the arguments of the function
help(visplots.knnDecisionPlot)
# Visualise the boundaries
visplots.knnDecisionPlot(XTrain, yTrain, XTest, yTest, header, n_neighbors= 3)
In [ ]:
visplots.knnDecisionPlot(XTrain, yTrain, XTest, yTest, header, n_neighbors= 99)
In [ ]:
# Build the classifier with two pre-defined parameters (n_neighbors and weights)
# Visualise the boundaries of a KNN model with weights equal to "distance"
knnW3 = KNeighborsClassifier(n_neighbors=3, weights='distance')
knnW3.fit(XTrain, yTrain)
predictedW3 = knnW3.predict(XTest)
print (metrics.classification_report(yTest, predictedW3))
print ("Overall Accuracy:", round(metrics.accuracy_score(yTest, predictedW3), 2))
visplots.knnDecisionPlot(XTrain, yTrain, XTest, yTest, header, n_neighbors= 3, weights="distance")
In [ ]:
# Implement cross-validation for knn3
knn3scores = cross_val_score(knn3, XTrain, yTrain, cv = 5)
print (knn3scores)
print ("Mean of scores KNN3", knn3scores.mean())
In [ ]:
# Conduct a grid search with 10-fold cross-validation using the dictionary of parameters
n_neighbors = np.arange(1, 51, 2) # odd numbers of neighbors used
weights = ['uniform','distance']
parameters = [{'n_neighbors': n_neighbors, 'weights': weights}]
gridCV = GridSearchCV(KNeighborsClassifier(), parameters, cv=10, n_jobs=-1)
gridCV.fit(XTrain, yTrain)
# Print the optimal parameters
bestNeighbors = gridCV.best_params_['n_neighbors']
bestWeight = gridCV.best_params_['weights']
print ("Best parameters: n_neighbors=", bestNeighbors, "and weight=", bestWeight)
In [ ]:
# grid_scores_ contains parameter settings and scores
scores = np.zeros((len(n_neighbors), len(weights)))
for score in gridCV.grid_scores_:
    ne = score[0]['n_neighbors']
    i = np.argmax(n_neighbors == ne)
    j = 0 if (score[0]['weights'] == 'uniform') else 1
    scores[i,j] = score[1]
In [ ]:
# Visualise the grid search results using a heatmap
# Make a heatmap with the performance
data = [
Heatmap(
x = n_neighbors,
y = weights,
z = scores.T,
colorscale='Jet',
reversescale=True,
colorbar = dict(
title = "Classification Accuracy",
len = 5,
nticks=10
)
)
]
layout = Layout(
xaxis = dict(title = "Number of K nearest neighbors", tickvals = n_neighbors),
yaxis = dict(title = "Weights"),
height= 230,
)
fig = dict(data = data, layout = layout)
iplot(fig)
In [ ]:
# Build the classifier using the optimal parameters detected by grid search
knn = KNeighborsClassifier(n_neighbors = bestNeighbors, weights = bestWeight)
knn.fit(XTrain, yTrain)
yPredKnn = knn.predict(XTest)
print (metrics.classification_report(yTest, yPredKnn))
print ("Overall Accuracy:", round(metrics.accuracy_score(yTest, yPredKnn), 2))
In [ ]:
# Conduct a randomised search on hyperparameters
parameters = {'n_neighbors': randint(1,200)}
randomCV = RandomizedSearchCV(KNeighborsClassifier(),
param_distributions=parameters, n_iter=20)
randomCV.fit(XTrain, yTrain)
# Print the optimal n_neighbors detected by randomised search
bestNeighbors = randomCV.best_params_['n_neighbors']
print("Best parameters: n_neighbors=", bestNeighbors)
In [ ]:
neighbor = [score_tuple[0]['n_neighbors'] for score_tuple in randomCV.grid_scores_]
result = [score_tuple[1] for score_tuple in randomCV.grid_scores_]
In [ ]:
# Visualise the randomised search results using a scatterplot
data = [
Scatter(
x = neighbor,
y = result,
mode = "markers"
)
]
layout = Layout(
xaxis = dict(title = "Number of k nearest neighbors"),
yaxis = dict(title = "Classification Accuracy"),
height = 500,
width = 900,
)
fig = dict(data = data, layout = layout)
iplot(fig)
In [ ]:
# Build the classifier using the optimal parameters detected by randomised search
knn = KNeighborsClassifier(n_neighbors=bestNeighbors)
knn.fit(XTrain, yTrain)
yPredKnn = knn.predict(XTest)
print (metrics.classification_report(yTest, yPredKnn))
print ("Overall Accuracy:", round(metrics.accuracy_score(yTest, yPredKnn), 2))
In [ ]:
# Split into training and test sets
XTrain, XTest, yTrain, yTest = train_test_split(X, y, random_state=1)
In [ ]:
# Calculate the frequency of classes in yTest
yFreq = scipy.stats.itemfreq(yTest)
print (yFreq)
In [ ]:
dtc = DecisionTreeClassifier(max_depth=3)
dtc.fit(XTrain, yTrain)
predDT = dtc.predict(XTest)
print (metrics.classification_report(yTest, predDT))
print ("Overall Accuracy:", round(metrics.accuracy_score(yTest, predDT),2))
visplots.dtDecisionPlot(XTrain, yTrain, XTest, yTest, header, max_depth=3)
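In [ ]:
# Optional sketch: export the fitted tree in Graphviz format so it can be inspected
# (the output file name "tree.dot" is arbitrary; turning it into an image requires
# the external graphviz `dot` tool)
from sklearn.tree import export_graphviz
export_graphviz(dtc, out_file="tree.dot", feature_names=list(header[:-1]))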
In [ ]:
# Split into training and test sets
XTrain, XTest, yTrain, yTest = train_test_split(X, y, random_state=1)
In [ ]:
# Calculate the frequency of classes in yTest
yFreq = scipy.stats.itemfreq(yTest)
print (yFreq)
In [ ]:
# Build a Random Forest classifier with 100 decision trees
rf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=4)
rf.fit(XTrain, yTrain)
predRF = rf.predict(XTest)
print (metrics.classification_report(yTest, predRF))
print ("Overall Accuracy:", round(metrics.accuracy_score(yTest, predRF),2))
In [ ]:
# Visualise the average accuracy
visplots.rfAvgAcc(rfModel = rf, XTest =XTest, yTest=yTest)
In [ ]:
# Display the importance of the features in a barplot
importance = rf.feature_importances_
names = header[:-1]  # all feature names (the last column of header is the class)
data = [
Bar(
x = importance,
y = names,
orientation = 'h',
)
]
layout = Layout(
xaxis = dict(title = "Importance of features"),
yaxis = dict(title = "Features"),
width = 800,
margin=Margin(
l=250,
r=50,
b=100,
t=50,
pad=4
),
)
fig = dict(data = data, layout = layout)
iplot(fig)
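In [ ]:
# Optional sketch: list the features ranked by importance (highest first), which can
# be easier to read than the unsorted barplot above
order = np.argsort(importance)[::-1]
for idx in order[:10]:
    print(header[idx], round(importance[idx], 3))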
In [ ]:
# Check the arguments of the function
help(visplots.rfDecisionPlot)
# Visualise the boundaries
visplots.rfDecisionPlot(XTrain, yTrain, XTest, yTest, header)
In [ ]:
# Conduct a grid search with 10-fold cross-validation using the dictionary of parameters
# Parameters you can investigate include:
n_estimators = np.arange(1, 30, 5)
max_depth = np.arange(1, 100, 5)
# Also, you may choose any of the following
# max_features = [1, 3, 10]
# min_samples_split = [1, 3, 10]
# min_samples_leaf = [1, 3, 10]
# bootstrap = [True, False]
# criterion = ["gini", "entropy"]
parameters = [{'n_estimators': n_estimators, 'max_depth': max_depth}]
gridCV = GridSearchCV(RandomForestClassifier(), param_grid=parameters, cv=10, n_jobs=4)
gridCV.fit(XTrain, yTrain)
# Print the optimal parameters
best_n_estim = gridCV.best_params_['n_estimators']
best_max_depth = gridCV.best_params_['max_depth']
print ("Best parameters: n_estimators=", best_n_estim,", max_depth=", best_max_depth)
In [ ]:
# Build the classifier using the optimal parameters detected by grid search
clfRDF = RandomForestClassifier(n_estimators=best_n_estim, max_depth=best_max_depth)
clfRDF.fit(XTrain, yTrain)
predRF = clfRDF.predict(XTest)
print (metrics.classification_report(yTest, predRF))
print ("Overall Accuracy:", round(metrics.accuracy_score(yTest, predRF),2))
In [ ]:
# Create a heatmap like the one you made when you applied GridSearchCV to KNN
# reorganising the scores in a matrix
scores = np.zeros((len(n_estimators), len(max_depth)))
for score in gridCV.grid_scores_:
    ne = score[0]['n_estimators']
    md = score[0]['max_depth']
    i = np.argmax(n_estimators == ne)
    j = np.argmax(max_depth == md)
    scores[i,j] = score[1]
# Make a heatmap with the performance
data = [
Heatmap(
x = n_estimators,
y = max_depth,
z = scores.T,
colorscale='Jet',
reversescale=True,
colorbar = dict(
title = "Classification Accuracy",
nticks=10
)
)
]
layout = Layout(
xaxis = dict(title = "Number of estimators"),
yaxis = dict(title = "Max Depth", tickvals = max_depth ),
height = 800,
)
fig = dict(data = data, layout = layout)
iplot(fig)
In [ ]:
submit_dataset_path = "./processed_data/spam_submit.csv"
submit_dataset = pd.read_csv(submit_dataset_path, sep=",", )
submit_dataset.head()
In [ ]:
cols = submit_dataset.columns.tolist()
cols = cols[2:] + [cols[1]]
submit_dataset = submit_dataset[cols]
submit_dataset.head()
In [ ]:
npArray = np.array(submit_dataset)
submit_X = npArray[:,:-1].astype(float)
submit_y = npArray[:,-1]
In [ ]:
le = preprocessing.LabelEncoder()
submit_y = le.fit_transform(submit_y)
In [ ]:
predRF = clfRDF.predict(submit_X)
predRF
# print (metrics.classification_report(submit_y, predRF))
# print ("Overall Accuracy:", round(metrics.accuracy_score(submit_y, predRF),2))
In [ ]:
# Change the value of n_jobs and estimate the execution time of fit
import timeit
n_jobs_to_try = np.arange(1,9)
elapsed_times = []
for jobs in n_jobs_to_try:
    start_time = timeit.default_timer()
    rf = RandomForestClassifier(n_estimators=500, random_state=0, n_jobs=jobs)
    rf.fit(XTrain, yTrain)
    elapsed = timeit.default_timer() - start_time
    elapsed_times.append(elapsed)
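In [ ]:
# Quick follow-up: speed-up of each setting relative to the first timed run
# (this assumes the first entry of n_jobs_to_try is 1)
for jobs, t in zip(n_jobs_to_try, elapsed_times):
    print(jobs, "job(s):", round(elapsed_times[0] / t, 2), "x speed-up")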
In [ ]:
data = [
Bar(
x=n_jobs_to_try,
y=elapsed_times
)
]
fig = dict(data=data, layout=Layout(yaxis=dict(title='seconds')))
iplot(fig)
In [ ]:
data = [
Scatter(
x=n_jobs_to_try,
y=elapsed_times,
mode='markers+lines',
)
]
fig = dict(data=data, layout=Layout(yaxis=dict(title='seconds')))
iplot(fig)
In [ ]:
# Load libraries
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import matplotlib_visplots
%matplotlib inline
In [ ]:
# Split into training and test sets
XTrain, XTest, yTrain, yTest = train_test_split(X, y, random_state=1)
In [ ]:
# Calculate the frequency of classes in yTest
yFreq = scipy.stats.itemfreq(yTest)
print (yFreq)
In [ ]:
linearSVM = SVC(kernel='linear', C=1.0)
linearSVM.fit(XTrain, yTrain)
yPredLinear = linearSVM.predict(XTest)
print(metrics.classification_report(yTest, yPredLinear))
print("Overall Accuracy:", round(metrics.accuracy_score(yTest, yPredLinear), 2))
In [ ]:
# Check the arguments of the function
help(matplotlib_visplots.svmDecisionPlot)
# Visualise the boundaries of the linear SVM
matplotlib_visplots.svmDecisionPlot(XTrain, yTrain, XTest, yTest, 'linear')
In [ ]:
# Non-linear (RBF) SVM
rbfSVM = SVC(kernel='rbf', C=1.0, gamma=0.0)  # gamma=0.0 falls back to 1/n_features in this scikit-learn version
rbfSVM.fit(XTrain, yTrain)
yPredRBF = rbfSVM.predict(XTest)
print(metrics.classification_report(yTest, yPredRBF))
print("Overall Accuracy:", round(metrics.accuracy_score(yTest, yPredRBF), 2))
In [ ]:
# Check the arguments of the function
help(matplotlib_visplots.svmDecisionPlot)
matplotlib_visplots.svmDecisionPlot(XTrain, yTrain, XTest, yTest, 'rbf')
In [ ]:
# Define the parameters to be optimised and their values/ranges
# Range for gamma and Cost hyperparameters
g_range = 2. ** np.arange(-15, 5, step=2)
C_range = 2. ** np.arange(-5, 15, step=2)
parameters = [{'gamma': g_range, 'C': C_range}]
grid = GridSearchCV(SVC(), parameters, cv= 10)
grid.fit(XTrain, yTrain)
bestG = grid.best_params_['gamma']
bestC = grid.best_params_['C']
print "The best parameters are: gamma=", np.log2(bestG), " and Cost=", np.log2(bestC)
In [ ]:
scores = [x[1] for x in grid.grid_scores_]
scores = np.array(scores).reshape(len(C_range), len(g_range))
plt.figure(figsize=(10, 6))
plt.imshow(scores, interpolation='nearest', origin='upper', cmap=plt.cm.get_cmap('jet_r'))
plt.xticks(np.arange(len(g_range)), np.log2(g_range))
plt.yticks(np.arange(len(C_range)), np.log2(C_range))
plt.xlabel('gamma (log2)')
plt.ylabel('Cost (log2)')
cbar = plt.colorbar()
cbar.set_label('Classification Accuracy', rotation=270, labelpad=20)
plt.show()
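In [ ]:
# Sketch (mirroring the earlier sections): rebuild the RBF SVM with the optimal
# parameters found by the grid search and evaluate it on the test set
rbfSVM = SVC(kernel='rbf', C=bestC, gamma=bestG)
rbfSVM.fit(XTrain, yTrain)
yPredRBF = rbfSVM.predict(XTest)
print(metrics.classification_report(yTest, yPredRBF))
print("Overall Accuracy:", round(metrics.accuracy_score(yTest, yPredRBF), 2))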
In [ ]:
from sklearn.linear_model import LogisticRegression
In [ ]:
l_regression = LogisticRegression()
l_regression.fit(XTrain, yTrain)
l_prediction = l_regression.predict(XTest)
print(metrics.classification_report(yTest, l_prediction))
print("Overall Accuracy:", round(metrics.accuracy_score(yTest, l_prediction), 2))
In [ ]:
# Check the arguments of the function
help(matplotlib_visplots.logregDecisionPlot)
matplotlib_visplots.logregDecisionPlot(XTrain, yTrain, XTest, yTest)
In [ ]:
# Define the parameters to be optimised and their values/ranges
# Range for pen and C hyperparameters
pen = ['l1','l2']
C_range = 2. ** np.arange(-5, 15, step=2)
parameters = [{'C': C_range, 'penalty': pen}]
grid = GridSearchCV(LogisticRegression(), parameters, cv= 10)
grid.fit(XTrain, yTrain)
bestC = grid.best_params_['C']
bestP = grid.best_params_['penalty']
print "The best parameters are: cost=", bestC , " and penalty=", bestP
In [ ]:
scores = [x[1] for x in grid.grid_scores_]
# grid_scores_ varies C in the outer loop and penalty in the inner loop,
# so reshape to (len(C_range), len(pen)): rows = C values, columns = penalties
scores = np.array(scores).reshape(len(C_range), len(pen))
plt.figure(figsize=(12, 6))
plt.imshow(scores, interpolation='nearest', origin='upper', cmap=plt.cm.get_cmap('jet_r'))
plt.xticks(np.arange(len(pen)), pen)
plt.yticks(np.arange(len(C_range)), C_range)
plt.xlabel('penalisation norm')
plt.ylabel('inv regularisation strength')
cbar = plt.colorbar()
cbar.set_label('Classification Accuracy', rotation=270, labelpad=20)
plt.show()
In [ ]:
l_regression = LogisticRegression(C=bestC, penalty=bestP)
l_regression.fit(XTrain, yTrain)
l_prediction = l_regression.predict(XTest)
print(metrics.classification_report(yTest, l_prediction))
print("Overall Accuracy:", round(metrics.accuracy_score(yTest, l_prediction), 2))
In [ ]:
from multilayer_perceptron import multilayer_perceptron
In [ ]:
nnet = multilayer_perceptron.MultilayerPerceptronClassifier(activation='logistic',
hidden_layer_sizes=2, learning_rate_init=.5)
nnet.fit(XTrain, yTrain)
net_prediction = nnet.predict(XTest)
print(metrics.classification_report(yTest, net_prediction))
print("Overall Accuracy:", round(metrics.accuracy_score(yTest, net_prediction), 2))
In [ ]:
# Check the arguments of the function
help(matplotlib_visplots.nnDecisionPlot)
matplotlib_visplots.nnDecisionPlot(XTrain, yTrain, XTest, yTest, 2, .5)
matplotlib_visplots.nnDecisionPlot(XTrain, yTrain, XTest, yTest, (2,3,6), .5)
In [ ]:
# Define the parameters to be optimised and their values/ranges
# Ranges for the hidden layer sizes and the initial learning rate
layer_size_range = [(3,2),(10,10),(2,2,2),10,5] # different networks shapes
learning_rate_range = np.linspace(.1,1,3)
parameters = [{'hidden_layer_sizes': layer_size_range, 'learning_rate_init': learning_rate_range}]
grid = GridSearchCV(multilayer_perceptron.MultilayerPerceptronClassifier(), parameters, cv= 10)
grid.fit(XTrain, yTrain)
best_size = grid.best_params_['hidden_layer_sizes']
best_best_lr = grid.best_params_['learning_rate_init']
print "The best parameters are: hidden_layer_sizes=", best_size, " and learning_rate_init=", best_best_lr
In [ ]:
scores = [x[1] for x in grid.grid_scores_]
scores = np.array(scores).reshape(len(layer_size_range), len(learning_rate_range))
scores = np.transpose(scores)
plt.figure(figsize=(12, 6))
plt.imshow(scores, interpolation='nearest', origin='upper', cmap=plt.cm.get_cmap('jet_r'))
plt.xticks(np.arange(len(layer_size_range)), layer_size_range)
plt.yticks(np.arange(len(learning_rate_range)), learning_rate_range)
plt.xlabel('hidden layer topology')
plt.ylabel('learning rate')
cbar = plt.colorbar()
cbar.set_label('Classification Accuracy', rotation=270, labelpad=20)
plt.show()
In [ ]:
nnet = multilayer_perceptron.MultilayerPerceptronClassifier(hidden_layer_sizes=best_size, learning_rate_init=best_best_lr)
nnet.fit(XTrain, yTrain)
net_prediction = nnet.predict(XTest)
print(metrics.classification_report(yTest, net_prediction))
print("Overall Accuracy:", round(metrics.accuracy_score(yTest, net_prediction), 2))
In [ ]: