In [1]:
import scipy
import numpy as np
import pandas as pd
import plotly.plotly as py
import visplots
from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from sklearn import preprocessing, metrics
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from scipy.stats.distributions import randint
init_notebook_mode()
print("libraries all imported, ready to go")
In [2]:
# Import the data and explore the first few rows
dataset_path = "./processed_data/spam_dataset.csv"
dataset = pd.read_csv(dataset_path, sep=",", )
dataset_name = "Spam"
dataset.head()
In [ ]:
cols = dataset.columns.tolist()
cols = cols[2:] + [cols[1]]
dataset = dataset[cols]
dataset.head()
In [ ]:
dataset = dataset.dropna()
In [ ]:
# Convert to numpy array and check the dimensionality
npArray = np.array(dataset)
print(npArray.shape)
In [ ]:
header = dataset.columns.values
header
In [ ]:
# Split to input matrix X and class vector y
X = npArray[:,:-1].astype(float)
y = npArray[:,-1]
# Print the dimensions of X and y
print ("X dimensions:", X.shape)
print ("y dimensions:", y.shape)
In [ ]:
# Print the y frequencies
yFreq = scipy.stats.itemfreq(y)
print(yFreq)
In [ ]:
# Copy out the class labels (useful for plotting)
class_labels = [yFreq[0][0], yFreq[1][0]]
print(class_labels)
In [ ]:
# Convert the categorical to numeric values, and print the y frequencies
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
yFreq = scipy.stats.itemfreq(y)
print(yFreq)
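In [ ]:
# Quick optional check: the LabelEncoder keeps the original class names in
# le.classes_, and each name is encoded as its index in that array
print(dict(zip(le.classes_, le.transform(le.classes_))))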
In [ ]:
# Display the y frequencies in a barplot with Plotly
# (1) Create the Data object
data = [
Bar(
x = [class_labels[0], class_labels[1]],
y = [yFreq[0][1], yFreq[1][1]],
marker = dict(color=['blue','red'])
)
]
# (2) Create a Layout object
layout = Layout(
xaxis = dict(title = dataset_name),
yaxis = dict(title = "Count"),
width = 500
)
# (3) Create a Figure object
fig = dict(data = data, layout = layout)
# (4) Plot
iplot(fig)
In [ ]:
dataset.describe()
In [ ]:
# Create a boxplot of the raw data
nrow, ncol = X.shape
data = [
Box(
y = X[:,i], # values to be used for box plot
name = header[i], # label (on hover and x-axis)
marker = dict(color = "purple"),
) for i in range(ncol)
]
layout = Layout(
xaxis = dict(title = "Feature"),
yaxis = dict(title = "Value"),
showlegend=False,
)
fig = dict(data = data, layout = layout)
iplot(fig)
In [ ]:
# Alternatively
data = [
Box(
y = X[:,i],
name = header[i],
boxpoints='all',
jitter=0.4,
whiskerwidth=0.2,
marker=dict(
size=2,
),
line=dict(width=1),
boxmean='sd'
) for i in range(X.shape[1])
]
layout = Layout(
xaxis = dict(title = "Feature", tickangle=40),
yaxis = dict(title = "Value"),
showlegend=False,
height=700,
margin=Margin(b=170, t=50),
)
fig = dict(data = data, layout = layout)
iplot(fig)
In [ ]:
# Create a boxplot of the scaled data
X_scaled = preprocessing.scale(X)
nrow, ncol = X.shape
data = [
Box(
y = X_scaled[:,i], # values to be used for box plot
name = header[i], # label (on hover and x-axis)
marker = dict(color = "purple"),
) for i in range(ncol)
]
layout = Layout(
xaxis = dict(title = "Feature"),
yaxis = dict(title = "Value"),
showlegend=False,
)
fig = dict(data = data, layout = layout)
iplot(fig)
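In [ ]:
# Sanity check (optional): after preprocessing.scale, every column should have
# (approximately) zero mean and unit standard deviation
print("Column means (first 3):", np.round(X_scaled.mean(axis=0)[:3], 2))
print("Column stds  (first 3):", np.round(X_scaled.std(axis=0)[:3], 2))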
In [ ]:
# Create a scatter plot of the first two features
f1 = 0
f2 = 3
data = [
Scatter(
x = X[:, f1],
y = X[:, f2],
mode = "markers"
)
]
layout = Layout(
xaxis = dict(title = header[f1]),
yaxis = dict(title = header[f2])
)
fig = dict(data = data, layout = layout)
iplot(fig)
In [ ]:
# Create an enhanced scatter plot of the first two features
f1 = 0
f2 = 3
# Class "1" represented with red x
trace1 = Scatter(
x = X[y == 1, f1],
y = X[y == 1, f2],
mode = 'markers',
name = 'Class "1"',
marker = dict(
color = 'red',
symbol = 'x'
)
)
# Class "0" represented with blue circles
trace2 = Scatter(
x = X[y == 0, f1],
y = X[y == 0, f2],
mode = 'markers',
name = 'Class "0"',
marker = dict(
color = 'blue',
symbol = 'circle'
)
)
layout = Layout(
xaxis = dict(title = header[f1], type='log'),
yaxis = dict(title = header[f2], type='log'),
height= 600,
)
fig = dict(data = [trace1, trace2], layout = layout)
iplot(fig)
In [ ]:
# Create a grid plot of scatterplots using a combination of features
from plotly import tools
fig = tools.make_subplots(rows=4, cols=4, shared_xaxes=True, shared_yaxes=True)
for row in range(0, 4):
    for col in range(0, 4):
        # red x, class "1"
        trace1 = Scatter(
            x = X[y == 1, col],
            y = X[y == 1, row],
            mode = 'markers',
            marker = dict(
                color = 'red',
                symbol = 'x',
                opacity = .5
            )
        )
        # blue circles, class "0"
        trace2 = Scatter(
            x = X[y == 0, col],
            y = X[y == 0, row],
            mode = 'markers',
            marker = dict(
                color = 'blue',
                symbol = 'circle',
                opacity = .5
            )
        )
        posX = row+1
        posY = col+1
        fig.append_trace(trace1, posX, posY)
        fig.append_trace(trace2, posX, posY)
        fig['layout']['xaxis'+str(posX)].update(title=header[row])
        fig['layout']['yaxis'+str(posY)].update(title=header[col])
fig['layout'].update(
showlegend=False,
height=900,
)
iplot(fig)
In [ ]:
# Create a 3D scatterplot using the first three features
f1 = 0
f2 = 1
f3 = 2
desc = dict(
classes = [1, 0],
colors = ["red", "blue"],
labels = ['Class "1"', 'Class "0"'],
symbols = ["x", "circle"]
)
data = [
Scatter3d(
x = X[y == desc["classes"][i], f1],
y = X[y == desc["classes"][i], f2],
z = X[y == desc["classes"][i], f3],
name = desc["labels"][i],
mode = "markers",
marker = dict(
size = 2.5,
symbol = desc["symbols"][i],
color = desc["colors"][i]
)
) for i in range(len(desc["labels"]))
]
layout = Layout(
scene=Scene(
xaxis=XAxis(title=header[f1], titlefont=dict(size=11)),
yaxis=YAxis(title=header[f2], titlefont=dict(size=11)),
zaxis=ZAxis(title=header[f3], titlefont=dict(size=11))
),
margin=Margin(l=80, r=80, b=0, t=0, pad=0, autoexpand=True),
height= 600,
)
fig = dict(data = data, layout = layout)
iplot(fig)
In [ ]:
# Calculate the correlation coefficient
correlationMatrix = np.corrcoef(X_scaled, rowvar=0)
correlationMatrix
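In [ ]:
# Optional follow-up sketch: find the pair of distinct features with the largest
# absolute correlation, which the heatmap below should also highlight
absCorr = np.abs(correlationMatrix - np.eye(correlationMatrix.shape[0]))
i, j = np.unravel_index(np.argmax(absCorr), absCorr.shape)
print(header[i], "<->", header[j], ":", round(correlationMatrix[i, j], 2))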
In [ ]:
# Create a heatmap of the correlation coefficients
data = [
Heatmap(
x = header[:-1],        # feature names on both
y = header[:-1],        # axes (the last header entry is the class column, not a feature)
z = correlationMatrix,  # correlation as color contours
colorscale='YlOrRd',    # light yellow-orange-red colormap
reversescale=True       # inverse colormap order
)
]
layout = Layout(
xaxis = dict(title = "Feature"),
yaxis = dict(title = "Feature"),
margin= Margin(l=250),
height = 700,
)
fig = dict(data = data, layout = layout)
iplot(fig)
In [ ]:
# Scale the dataset
X_scaled = preprocessing.scale(X)
In [ ]:
# Split into training and test sets
XTrain, XTest, yTrain, yTest = train_test_split(X_scaled, y, random_state=1)
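In [ ]:
# Note: scaling the full dataset before splitting lets the test set influence the
# scaling statistics. A stricter alternative (sketch only, not used in the rest of
# this notebook; the variable names are just for illustration) is to fit the scaler
# on the training split and apply it to both splits:
XTrainRaw, XTestRaw, yTrainRaw, yTestRaw = train_test_split(X, y, random_state=1)
scaler = preprocessing.StandardScaler().fit(XTrainRaw)
XTrainAlt = scaler.transform(XTrainRaw)
XTestAlt = scaler.transform(XTestRaw)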
In [ ]:
# Print the dimensionality of the individual splits
print ("XTrain dimensions: ", XTrain.shape)
print ("yTrain dimensions: ", yTrain.shape)
print ("XTest dimensions: ", XTest.shape)
print ("yTest dimensions: ", yTest.shape)
In [ ]:
# Calculate the frequency of classes in yTest
yFreq = scipy.stats.itemfreq(yTest)
print (yFreq)
In [ ]:
# Build a KNN classifier with 3 nearest neighbors
knn3 = KNeighborsClassifier(n_neighbors=3)
knn3.fit(XTrain, yTrain)
yPredK3 = knn3.predict(XTest)
print ("Overall Accuracy:", round(metrics.accuracy_score(yTest, yPredK3), 2))
In [ ]:
# Build a KNN classifier with 99 nearest neighbors
knn99 = KNeighborsClassifier(n_neighbors=99)
knn99.fit(XTrain, yTrain)
yPredK99 = knn99.predict(XTest)
print ("Overall Accuracy:", round(metrics.accuracy_score(yTest, yPredK99), 2))
In [ ]:
# Get the confusion matrix for your classifier using metrics.confusion_matrix
mat = metrics.confusion_matrix(yTest, yPredK3)
print (mat)
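In [ ]:
# A short sketch relating the confusion matrix to the summary metrics
# (in scikit-learn, rows are true classes and columns are predicted classes)
tn, fp, fn, tp = mat[0, 0], mat[0, 1], mat[1, 0], mat[1, 1]
print("Accuracy:           ", round((tp + tn) / float(tp + tn + fp + fn), 2))
print("Precision (class 1):", round(tp / float(tp + fp), 2))
print("Recall (class 1):   ", round(tp / float(tp + fn), 2))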
In [ ]:
# Report the metrics using metrics.classification_report
print (metrics.classification_report(yTest, yPredK3))
print ("accuracy: ", round(metrics.accuracy_score(yTest, yPredK3), 2))
In [ ]:
# Check the arguments of the function
help(visplots.knnDecisionPlot)
# Visualise the boundaries
visplots.knnDecisionPlot(XTrain, yTrain, XTest, yTest, header, n_neighbors= 3)
In [ ]:
visplots.knnDecisionPlot(XTrain, yTrain, XTest, yTest, header, n_neighbors= 99)
In [ ]:
# Build the classifier with two pre-defined parameters (n_neighbors and weights)
# Visualise the boundaries of a KNN model with weights equal to "distance"
knnW3 = KNeighborsClassifier(n_neighbors=3, weights='distance')
knnW3.fit(XTrain, yTrain)
predictedW3 = knnW3.predict(XTest)
print (metrics.classification_report(yTest, predictedW3))
print ("Overall Accuracy:", round(metrics.accuracy_score(yTest, predictedW3), 2))
visplots.knnDecisionPlot(XTrain, yTrain, XTest, yTest, header, n_neighbors= 3, weights="distance")
In [ ]:
# Implement cross-validation for knn3
knn3scores = cross_val_score(knn3, XTrain, yTrain, cv = 5)
print (knn3scores)
print ("Mean of scores KNN3", knn3scores.mean())
In [ ]:
# Conduct a grid search with 10-fold cross-validation using the dictionary of parameters
n_neighbors = np.arange(1, 51, 2) # odd numbers of neighbors used
weights = ['uniform','distance']
parameters = [{'n_neighbors': n_neighbors, 'weights': weights}]
gridCV = GridSearchCV(KNeighborsClassifier(), parameters, cv=10, n_jobs=-1)
gridCV.fit(XTrain, yTrain)
# Print the optimal parameters
bestNeighbors = gridCV.best_params_['n_neighbors']
bestWeight = gridCV.best_params_['weights']
print ("Best parameters: n_neighbors=", bestNeighbors, "and weight=", bestWeight)
In [ ]:
# grid_scores_ contains parameter settings and scores
scores = np.zeros((len(n_neighbors), len(weights)))
for score in gridCV.grid_scores_:
    ne = score[0]['n_neighbors']
    i = np.argmax(n_neighbors == ne)
    j = 0 if (score[0]['weights'] == 'uniform') else 1
    scores[i,j] = score[1]
In [ ]:
# Visualise the grid search results using a heatmap
# Make a heatmap with the performance
data = [
Heatmap(
x = n_neighbors,
y = weights,
z = scores.T,
colorscale='Jet',
reversescale=True,
colorbar = dict(
title = "Classification Accuracy",
len = 5,
nticks=10
)
)
]
layout = Layout(
xaxis = dict(title = "Number of K nearest neighbors", tickvals = n_neighbors),
yaxis = dict(title = "Weights"),
height= 230,
)
fig = dict(data = data, layout = layout)
iplot(fig)
In [ ]:
# Build the classifier using the optimal parameters detected by grid search
knn = KNeighborsClassifier(n_neighbors = bestNeighbors, weights = bestWeight)
knn.fit(XTrain, yTrain)
yPredKnn = knn.predict(XTest)
print (metrics.classification_report(yTest, yPredKnn))
print ("Overall Accuracy:", round(metrics.accuracy_score(yTest, yPredKnn), 2))
In [ ]:
# Conduct a randomised search on hyperparameters
parameters = {'n_neighbors': randint(1,200)}
randomCV = RandomizedSearchCV(KNeighborsClassifier(),
param_distributions=parameters, n_iter=20)
randomCV.fit(XTrain, yTrain)
# Print the optimal n_neighbors detected by randomised search
bestNeighbors = randomCV.best_params_['n_neighbors']
print("Best parameters: n_neighbors=", bestNeighbors)
In [ ]:
neighbor = [score_tuple[0]['n_neighbors'] for score_tuple in randomCV.grid_scores_]
result = [score_tuple[1] for score_tuple in randomCV.grid_scores_]
In [ ]:
# Visualise the randomised search results using a scatterplot
data = [
Scatter(
x = neighbor,
y = result,
mode = "markers"
)
]
layout = Layout(
xaxis = dict(title = "Number of k nearest neighbors"),
yaxis = dict(title = "Classification Accuracy"),
height = 500,
width = 900,
)
fig = dict(data = data, layout = layout)
iplot(fig)
In [ ]:
# Build the classifier using the optimal parameters detected by randomised search
knn = KNeighborsClassifier(n_neighbors=bestNeighbors)
knn.fit(XTrain, yTrain)
yPredKnn = knn.predict(XTest)
print (metrics.classification_report(yTest, yPredKnn))
print ("Overall Accuracy:", round(metrics.accuracy_score(yTest, yPredKnn), 2))
In [ ]:
# Split into training and test sets
XTrain, XTest, yTrain, yTest = train_test_split(X, y, random_state=1)
In [ ]:
# Calculate the frequency of classes in yTest
yFreq = scipy.stats.itemfreq(yTest)
print (yFreq)
In [ ]:
dtc = DecisionTreeClassifier(max_depth=3)
dtc.fit(XTrain, yTrain)
predDT = dtc.predict(XTest)
print (metrics.classification_report(yTest, predDT))
print ("Overall Accuracy:", round(metrics.accuracy_score(yTest, predDT),2))
visplots.dtDecisionPlot(XTrain, yTrain, XTest, yTest, header, max_depth=3)
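In [ ]:
# Optional sketch: export the fitted tree in Graphviz format so it can be inspected
# (the output file name "tree.dot" is arbitrary; turning it into an image requires
# the external graphviz `dot` tool)
from sklearn.tree import export_graphviz
export_graphviz(dtc, out_file="tree.dot", feature_names=list(header[:-1]))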
In [ ]:
# Split into training and test sets
XTrain, XTest, yTrain, yTest = train_test_split(X, y, random_state=1)
In [ ]:
# Calculate the frequency of classes in yTest
yFreq = scipy.stats.itemfreq(yTest)
print (yFreq)
In [ ]:
# Build a Random Forest classifier with 100 decision trees
rf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=4)
rf.fit(XTrain, yTrain)
predRF = rf.predict(XTest)
print (metrics.classification_report(yTest, predRF))
print ("Overall Accuracy:", round(metrics.accuracy_score(yTest, predRF),2))
In [ ]:
# Visualise the average accuracy
visplots.rfAvgAcc(rfModel = rf, XTest =XTest, yTest=yTest)
In [ ]:
# Display the importance of the features in a barplot
importance = rf.feature_importances_
names = header[:-1]  # all feature names (the last column of header is the class)
data = [
Bar(
x = importance,
y = names,
orientation = 'h',
)
]
layout = Layout(
xaxis = dict(title = "Importance of features"),
yaxis = dict(title = "Features"),
width = 800,
margin=Margin(
l=250,
r=50,
b=100,
t=50,
pad=4
),
)
fig = dict(data = data, layout = layout)
iplot(fig)
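In [ ]:
# Optional sketch: list the features ranked by importance (highest first), which can
# be easier to read than the unsorted barplot above
order = np.argsort(importance)[::-1]
for idx in order[:10]:
    print(header[idx], round(importance[idx], 3))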
In [ ]:
# Check the arguments of the function
help(visplots.rfDecisionPlot)
# Visualise the boundaries
visplots.rfDecisionPlot(XTrain, yTrain, XTest, yTest, header)
In [ ]:
# Conduct a grid search with 10-fold cross-validation using the dictionary of parameters
# Parameters you can investigate include:
n_estimators = np.arange(1, 30, 5)
max_depth = np.arange(1, 100, 5)
# Also, you may choose any of the following
# max_features = [1, 3, 10]
# min_samples_split = [1, 3, 10]
# min_samples_leaf = [1, 3, 10]
# bootstrap = [True, False]
# criterion = ["gini", "entropy"]
parameters = [{'n_estimators': n_estimators, 'max_depth': max_depth}]
gridCV = GridSearchCV(RandomForestClassifier(), param_grid=parameters, cv=10, n_jobs=4)
gridCV.fit(XTrain, yTrain)
# Print the optimal parameters
best_n_estim = gridCV.best_params_['n_estimators']
best_max_depth = gridCV.best_params_['max_depth']
print ("Best parameters: n_estimators=", best_n_estim,", max_depth=", best_max_depth)
In [ ]:
# Build the classifier using the optimal parameters detected by grid search
clfRDF = RandomForestClassifier(n_estimators=best_n_estim, max_depth=best_max_depth)
clfRDF.fit(XTrain, yTrain)
predRF = clfRDF.predict(XTest)
print (metrics.classification_report(yTest, predRF))
print ("Overall Accuracy:", round(metrics.accuracy_score(yTest, predRF),2))
In [ ]:
# Create a heatmap like the one you made when you applied GridSearchCV to KNN
# reorganising the scores in a matrix
scores = np.zeros((len(n_estimators), len(max_depth)))
for score in gridCV.grid_scores_:
    ne = score[0]['n_estimators']
    md = score[0]['max_depth']
    i = np.argmax(n_estimators == ne)
    j = np.argmax(max_depth == md)
    scores[i,j] = score[1]
# Make a heatmap with the performance
data = [
Heatmap(
x = n_estimators,
y = max_depth,
z = scores.T,
colorscale='Jet',
reversescale=True,
colorbar = dict(
title = "Classification Accuracy",
nticks=10
)
)
]
layout = Layout(
xaxis = dict(title = "Number of estimators"),
yaxis = dict(title = "Max Depth", tickvals = max_depth ),
height = 800,
)
fig = dict(data = data, layout = layout)
iplot(fig)
In [ ]:
submit_dataset_path = "./processed_data/spam_submit.csv"
submit_dataset = pd.read_csv(submit_dataset_path, sep=",", )
submit_dataset.head()
In [ ]:
cols = submit_dataset.columns.tolist()
cols = cols[2:] + [cols[1]]
submit_dataset = submit_dataset[cols]
submit_dataset.head()
In [ ]:
npArray = np.array(submit_dataset)
submit_X = npArray[:,:-1].astype(float)
submit_y = npArray[:,-1]
In [ ]:
le = preprocessing.LabelEncoder()
submit_y = le.fit_transform(submit_y)
In [ ]:
predRF = clfRDF.predict(submit_X)
predRF
# print (metrics.classification_report(submit_y, predRF))
# print ("Overall Accuracy:", round(metrics.accuracy_score(submit_y, predRF),2))
In [ ]:
# Change the value of n_jobs and estimate the execution time of fit
import timeit
n_jobs_to_try = np.arange(1,9)
elapsed_times = []
for jobs in n_jobs_to_try:
    start_time = timeit.default_timer()
    rf = RandomForestClassifier(n_estimators=500, random_state=0, n_jobs=jobs)
    rf.fit(XTrain, yTrain)
    elapsed = timeit.default_timer() - start_time
    elapsed_times.append(elapsed)
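In [ ]:
# Quick follow-up: speed-up of each setting relative to the first timed run
# (this assumes the first entry of n_jobs_to_try is 1)
for jobs, t in zip(n_jobs_to_try, elapsed_times):
    print(jobs, "job(s):", round(elapsed_times[0] / t, 2), "x speed-up")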
In [ ]:
data = [
Bar(
x=n_jobs_to_try,
y=elapsed_times
)
]
fig = dict(data=data, layout=Layout(yaxis=dict(title='seconds')))
iplot(fig)
In [ ]:
data = [
Scatter(
x=n_jobs_to_try,
y=elapsed_times,
mode='markers+lines',
)
]
fig = dict(data=data, layout=Layout(yaxis=dict(title='seconds')))
iplot(fig)
In [ ]:
# Load libraries
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import matplotlib_visplots
%matplotlib inline
In [ ]:
# Split into training and test sets
XTrain, XTest, yTrain, yTest = train_test_split(X, y, random_state=1)
In [ ]:
# Calculate the frequency of classes in yTest
yFreq = scipy.stats.itemfreq(yTest)
print (yFreq)
In [ ]:
linearSVM = SVC(kernel='linear', C=1.0)
linearSVM.fit(XTrain, yTrain)
yPredLinear = linearSVM.predict(XTest)
print(metrics.classification_report(yTest, yPredLinear))
print("Overall Accuracy:", round(metrics.accuracy_score(yTest, yPredLinear), 2))
In [ ]:
# Check the arguments of the function
help(matplotlib_visplots.svmDecisionPlot)
# Visualise the boundaries of the linear SVM
matplotlib_visplots.svmDecisionPlot(XTrain, yTrain, XTest, yTest, 'linear')
In [ ]:
# Non-linear (RBF) SVM
rbfSVM = SVC(kernel='rbf', C=1.0, gamma=0.0)  # gamma=0.0 falls back to 1/n_features in this scikit-learn version
rbfSVM.fit(XTrain, yTrain)
yPredRBF = rbfSVM.predict(XTest)
print(metrics.classification_report(yTest, yPredRBF))
print("Overall Accuracy:", round(metrics.accuracy_score(yTest, yPredRBF), 2))
In [ ]:
# Check the arguments of the function
help(matplotlib_visplots.svmDecisionPlot)
matplotlib_visplots.svmDecisionPlot(XTrain, yTrain, XTest, yTest, 'rbf')
In [ ]:
# Define the parameters to be optimised and their values/ranges
# Range for gamma and Cost hyperparameters
g_range = 2. ** np.arange(-15, 5, step=2)
C_range = 2. ** np.arange(-5, 15, step=2)
parameters = [{'gamma': g_range, 'C': C_range}]
grid = GridSearchCV(SVC(), parameters, cv= 10)
grid.fit(XTrain, yTrain)
bestG = grid.best_params_['gamma']
bestC = grid.best_params_['C']
print "The best parameters are: gamma=", np.log2(bestG), " and Cost=", np.log2(bestC)
In [ ]:
scores = [x[1] for x in grid.grid_scores_]
scores = np.array(scores).reshape(len(C_range), len(g_range))
plt.figure(figsize=(10, 6))
plt.imshow(scores, interpolation='nearest', origin='upper', cmap=plt.cm.get_cmap('jet_r'))
plt.xticks(np.arange(len(g_range)), np.log2(g_range))
plt.yticks(np.arange(len(C_range)), np.log2(C_range))
plt.xlabel('gamma (log2)')
plt.ylabel('Cost (log2)')
cbar = plt.colorbar()
cbar.set_label('Classification Accuracy', rotation=270, labelpad=20)
plt.show()
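In [ ]:
# Sketch (mirroring the earlier sections): rebuild the RBF SVM with the optimal
# parameters found by the grid search and evaluate it on the test set
rbfSVM = SVC(kernel='rbf', C=bestC, gamma=bestG)
rbfSVM.fit(XTrain, yTrain)
yPredRBF = rbfSVM.predict(XTest)
print(metrics.classification_report(yTest, yPredRBF))
print("Overall Accuracy:", round(metrics.accuracy_score(yTest, yPredRBF), 2))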
In [ ]:
from sklearn.linear_model import LogisticRegression
In [ ]:
l_regression = LogisticRegression()
l_regression.fit(XTrain, yTrain)
l_prediction = l_regression.predict(XTest)
print(metrics.classification_report(yTest, l_prediction))
print("Overall Accuracy:", round(metrics.accuracy_score(yTest, l_prediction), 2))
In [ ]:
# Check the arguments of the function
help(matplotlib_visplots.logregDecisionPlot)
matplotlib_visplots.logregDecisionPlot(XTrain, yTrain, XTest, yTest)
In [ ]:
# Define the parameters to be optimised and their values/ranges
# Range for pen and C hyperparameters
pen = ['l1','l2']
C_range = 2. ** np.arange(-5, 15, step=2)
parameters = [{'C': C_range, 'penalty': pen}]
grid = GridSearchCV(LogisticRegression(), parameters, cv= 10)
grid.fit(XTrain, yTrain)
bestC = grid.best_params_['C']
bestP = grid.best_params_['penalty']
print "The best parameters are: cost=", bestC , " and penalty=", bestP
In [ ]:
scores = [x[1] for x in grid.grid_scores_]
# grid_scores_ varies C in the outer loop and penalty in the inner loop,
# so reshape to (len(C_range), len(pen)): rows = C values, columns = penalties
scores = np.array(scores).reshape(len(C_range), len(pen))
plt.figure(figsize=(12, 6))
plt.imshow(scores, interpolation='nearest', origin='upper', cmap=plt.cm.get_cmap('jet_r'))
plt.xticks(np.arange(len(pen)), pen)
plt.yticks(np.arange(len(C_range)), C_range)
plt.xlabel('penalisation norm')
plt.ylabel('inv regularisation strength')
cbar = plt.colorbar()
cbar.set_label('Classification Accuracy', rotation=270, labelpad=20)
plt.show()
In [ ]:
l_regression = LogisticRegression(C=bestC, penalty=bestP)
l_regression.fit(XTrain, yTrain)
l_prediction = l_regression.predict(XTest)
print(metrics.classification_report(yTest, l_prediction))
print("Overall Accuracy:", round(metrics.accuracy_score(yTest, l_prediction), 2))
In [ ]:
from multilayer_perceptron import multilayer_perceptron
In [ ]:
nnet = multilayer_perceptron.MultilayerPerceptronClassifier(activation='logistic',
hidden_layer_sizes=2, learning_rate_init=.5)
nnet.fit(XTrain, yTrain)
net_prediction = nnet.predict(XTest)
print(metrics.classification_report(yTest, net_prediction))
print("Overall Accuracy:", round(metrics.accuracy_score(yTest, net_prediction), 2))
In [ ]:
# Check the arguments of the function
help(matplotlib_visplots.nnDecisionPlot)
matplotlib_visplots.nnDecisionPlot(XTrain, yTrain, XTest, yTest, 2, .5)
matplotlib_visplots.nnDecisionPlot(XTrain, yTrain, XTest, yTest, (2,3,6), .5)
In [ ]:
# Define the parameters to be optimised and their values/ranges
# Ranges for the hidden layer sizes and the initial learning rate
layer_size_range = [(3,2),(10,10),(2,2,2),10,5] # different networks shapes
learning_rate_range = np.linspace(.1,1,3)
parameters = [{'hidden_layer_sizes': layer_size_range, 'learning_rate_init': learning_rate_range}]
grid = GridSearchCV(multilayer_perceptron.MultilayerPerceptronClassifier(), parameters, cv= 10)
grid.fit(XTrain, yTrain)
best_size = grid.best_params_['hidden_layer_sizes']
best_best_lr = grid.best_params_['learning_rate_init']
print "The best parameters are: hidden_layer_sizes=", best_size, " and learning_rate_init=", best_best_lr
In [ ]:
scores = [x[1] for x in grid.grid_scores_]
scores = np.array(scores).reshape(len(layer_size_range), len(learning_rate_range))
scores = np.transpose(scores)
plt.figure(figsize=(12, 6))
plt.imshow(scores, interpolation='nearest', origin='upper', cmap=plt.cm.get_cmap('jet_r'))
plt.xticks(np.arange(len(layer_size_range)), layer_size_range)
plt.yticks(np.arange(len(learning_rate_range)), learning_rate_range)
plt.xlabel('hidden layer topology')
plt.ylabel('learning rate')
cbar = plt.colorbar()
cbar.set_label('Classification Accuracy', rotation=270, labelpad=20)
plt.show()
In [ ]:
nnet = multilayer_perceptron.MultilayerPerceptronClassifier(hidden_layer_sizes=best_size, learning_rate_init=best_best_lr)
nnet.fit(XTrain, yTrain)
net_prediction = nnet.predict(XTest)
print(metrics.classification_report(yTest, net_prediction))
print("Overall Accuracy:", round(metrics.accuracy_score(yTest, net_prediction), 2))
In [ ]: