Competition Testing

Load the required libraries


In [1]:
import scipy
import numpy as np
import pandas as pd
import plotly.plotly as py

import visplots

from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from sklearn import preprocessing, metrics
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from scipy.stats.distributions import randint

# Initialise Plotly's offline notebook mode before any iplot() call is made
init_notebook_mode()

# Reaching this line means every import above succeeded
print("libraries all imported, ready to go")


libraries all imported, ready to go

Importing the data


In [4]:
# Label used for axis titles in the plots further down
dataset_name = "Spam"

# Read the comma-separated spam data set from the working directory
dataset_path = "spam_dataset.csv"
dataset = pd.read_csv(dataset_path, sep=",")

# Preview the first rows
dataset.head()


Out[4]:
email_id is_spam word_freq_will word_freq_original word_freq_415 word_freq_mail char_freq_# char_freq_$ word_freq_internet word_freq_edu ... word_freq_receive word_freq_000 capital_run_length_average word_freq_address word_freq_george word_freq_cs word_freq_random word_freq_conference word_freq_technology char_freq_(
0 3628 no 0.00 0 0 0.00 0 0 0.0 0 ... 0.00 0 2.000 0.00 0.00 0 0 0 0 0.000
1 63 no 0.00 0 0 0.49 0 0 0.0 0 ... 0.00 0 2.824 0.00 0.99 0 0 0 0 0.062
2 1540 no 1.31 0 0 0.00 0 0 0.0 0 ... 0.00 0 2.176 0.00 0.00 0 0 0 0 0.431
3 4460 yes 0.75 0 0 0.50 0 0 0.5 0 ... 0.25 0 1.023 0.75 0.00 0 0 0 0 0.180
4 2771 no 0.00 0 0 0.00 0 0 0.0 0 ... 0.00 0 1.500 0.00 1.56 0 0 0 0 0.180

5 rows × 63 columns

Split the data into input features, X, and outputs, y


In [5]:
# Keep every feature column and move the class label to the end.
# Columns are selected by name rather than by position so that re-running this
# cell is harmless: the original positional slice (cols[2:] + [cols[1]])
# dropped two more feature columns on every additional execution.
id_column = "email_id"      # row identifier, not a predictive feature
label_column = "is_spam"    # class label ("yes"/"no")

cols = [c for c in dataset.columns.tolist() if c not in (id_column, label_column)]
cols = cols + [label_column]
dataset = dataset[cols]
dataset.head()


Out[5]:
word_freq_will word_freq_original word_freq_415 word_freq_mail char_freq_# char_freq_$ word_freq_internet word_freq_edu word_freq_hp word_freq_lab ... word_freq_000 capital_run_length_average word_freq_address word_freq_george word_freq_cs word_freq_random word_freq_conference word_freq_technology char_freq_( is_spam
0 0.00 0 0 0.00 0 0 0.0 0 0 0 ... 0 2.000 0.00 0.00 0 0 0 0 0.000 no
1 0.00 0 0 0.49 0 0 0.0 0 0 0 ... 0 2.824 0.00 0.99 0 0 0 0 0.062 no
2 1.31 0 0 0.00 0 0 0.0 0 0 0 ... 0 2.176 0.00 0.00 0 0 0 0 0.431 no
3 0.75 0 0 0.50 0 0 0.5 0 0 0 ... 0 1.023 0.75 0.00 0 0 0 0 0.180 yes
4 0.00 0 0 0.00 0 0 0.0 0 0 0 ... 0 1.500 0.00 1.56 0 0 0 0 0.180 no

5 rows × 62 columns


In [6]:
# Discard every row that contains at least one missing value
dataset = dataset.dropna()

In [8]:
# Convert the DataFrame to a NumPy array and print its (rows, columns) shape
npArray = np.array(dataset)
print(npArray.shape)


(1000, 62)

In [12]:
# Column names in the same order as the columns of npArray; the last entry is
# the class label column, so only the preceding entries name features of X.
header = dataset.columns.values
# Peek at a few of the feature names
header[1:10]


Out[12]:
array(['word_freq_original', 'word_freq_415', 'word_freq_mail',
       'char_freq_#', 'char_freq_$', 'word_freq_internet', 'word_freq_edu',
       'word_freq_hp', 'word_freq_lab'], dtype=object)

In [13]:
# The last column is the class label; every column before it is a numeric feature
y = npArray[:, -1]
X = npArray[:, :-1].astype(float)

# Report the shape of the feature matrix and of the label vector

print ("X dimensions:", X.shape)
print ("y dimensions:", y.shape)


('X dimensions:', (1000, 61))
('y dimensions:', (1000,))

Exploratory Data Analysis

Plot y frequencies


In [14]:
# Tabulate how often each class label occurs: one row per label, as [label, count]
yFreq = scipy.stats.itemfreq(y)
print(yFreq)


[['no' 668]
 ['yes' 332]]

In [15]:
# Keep the original (string) class names of the first two frequency rows;
# they are reused as tick labels in the bar plot below
class_lables = [yFreq[row][0] for row in (0, 1)]
print(class_lables)


['no', 'yes']

In [16]:
# Replace the string class labels with integer codes, then re-tabulate the counts

le = preprocessing.LabelEncoder()
le.fit(y)
y = le.transform(y)

yFreq = scipy.stats.itemfreq(y)
print(yFreq)


[[  0 668]
 [  1 332]]

In [17]:
# Bar chart of the class counts, drawn with Plotly

# (1) One bar per class: x holds the original class names, y the counts
class_bar = Bar(
    x = [class_lables[0], class_lables[1]],
    y = [yFreq[0][1], yFreq[1][1]],
    marker = {"color": ['blue', 'red']}
)
data = [class_bar]

# (2) Axis titles and figure width
layout = Layout(
    width = 500,
    xaxis = {"title": dataset_name},
    yaxis = {"title": "Count"}
)

# (3) Bundle traces and layout into a figure dictionary
fig = {"data": data, "layout": layout}

# (4) Render inline
iplot(fig)


Data distributions


In [18]:
# Summary statistics (count, mean, std, quartiles, min/max) for each numeric column
dataset.describe()


Out[18]:
word_freq_will word_freq_original word_freq_415 word_freq_mail char_freq_# char_freq_$ word_freq_internet word_freq_edu word_freq_hp word_freq_lab ... word_freq_receive word_freq_000 capital_run_length_average word_freq_address word_freq_george word_freq_cs word_freq_random word_freq_conference word_freq_technology char_freq_(
count 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000.00000 1000.000000 1000.000000 ... 1000.000000 1000.000000 1000.000000 1000.000000 1000.000000 1000 1000 1000.000000 1000.000000 1000.000000
mean 0.537950 0.038370 0.054690 0.189840 0.022792 0.066014 0.073210 0.18100 0.611970 0.118610 ... 0.051040 0.081300 4.857610 0.149980 0.775740 0 0 0.036690 0.125580 0.144783
std 0.831747 0.173041 0.365678 0.496022 0.109007 0.248239 0.270431 0.86285 1.734907 0.746169 ... 0.192314 0.358906 30.226395 0.955315 3.509211 0 0 0.268434 0.449092 0.232423
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 ... 0.000000 0.000000 1.000000 0.000000 0.000000 0 0 0.000000 0.000000 0.000000
25% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 ... 0.000000 0.000000 1.541000 0.000000 0.000000 0 0 0.000000 0.000000 0.000000
50% 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 0.000000 0.000000 ... 0.000000 0.000000 2.219500 0.000000 0.000000 0 0 0.000000 0.000000 0.072000
75% 0.820000 0.000000 0.000000 0.000000 0.000000 0.016000 0.000000 0.00000 0.315000 0.000000 ... 0.000000 0.000000 3.396500 0.000000 0.000000 0 0 0.000000 0.000000 0.195000
max 6.250000 2.220000 4.760000 5.260000 1.410000 4.017000 3.570000 10.00000 20.830000 14.280000 ... 2.000000 5.450000 667.000000 14.280000 33.330000 0 0 5.000000 4.760000 2.941000

8 rows × 61 columns


In [19]:
# Box plot of every unscaled feature, one box per column of X

nrow, ncol = X.shape

data = [
    Box(
        name = header[i],               # feature name (hover text and x-axis label)
        y = X[:, i],                    # the i-th feature column supplies the values
        marker = {"color": "purple"},
    )
    for i in range(ncol)
]

layout = Layout(
    showlegend = False,
    xaxis = {"title": "Feature"},
    yaxis = {"title": "Value"},
)

fig = {"data": data, "layout": layout}

iplot(fig)



In [20]:
# Variant of the raw box plot that also draws every observation as a jittered
# point and marks the mean and standard deviation of each feature

point_marker = {"size": 2}
box_outline = {"width": 1}

data = [
    Box(
        name = header[i],
        y = X[:, i],
        boxpoints = 'all',
        jitter = 0.4,
        whiskerwidth = 0.2,
        marker = point_marker,
        line = box_outline,
        boxmean = 'sd'
    )
    for i in range(X.shape[1])
]

layout = Layout(
    showlegend = False,
    height = 700,
    margin = Margin(b=170, t=50),   # bottom margin leaves room for the rotated tick labels
    xaxis = {"title": "Feature", "tickangle": 40},
    yaxis = {"title": "Value"},
)

fig = {"data": data, "layout": layout}

iplot(fig)



In [21]:
# Box plot of the features after standardisation with preprocessing.scale

X_scaled = preprocessing.scale(X)

nrow, ncol = X.shape

data = [
    Box(
        name = header[i],               # feature name (hover text and x-axis label)
        y = X_scaled[:, i],             # standardised values of the i-th feature
        marker = {"color": "purple"},
    )
    for i in range(ncol)
]

layout = Layout(
    showlegend = False,
    xaxis = {"title": "Feature"},
    yaxis = {"title": "Value"},
)

fig = {"data": data, "layout": layout}

iplot(fig)


Plot pairs of input features X as scatter plots


In [22]:
# Scatter plot of one pair of features (column indices f1 and f2 of X)

f1 = 0
f2 = 3

pair_trace = Scatter(
    x = X[:, f1],
    y = X[:, f2],
    mode = "markers"
)
data = [pair_trace]

layout = Layout(
    xaxis = {"title": header[f1]},
    yaxis = {"title": header[f2]}
)

fig = {"data": data, "layout": layout}

iplot(fig)



In [23]:
# Scatter plot of one pair of features, coloured by class.
# The legend names previously read "Low Quality"/"High Quality", a leftover
# from a different data set; here class 1 is spam ("yes") and class 0 is
# not spam ("no"), as encoded by the LabelEncoder above.

f1 = 0
f2 = 3

# Spam (class "1") represented with red x
trace1 = Scatter(
    x = X[y == 1, f1],
    y = X[y == 1, f2],
    mode = 'markers',
    name = 'Spam ("1")',
    marker = dict(
        color  = 'red',
        symbol = 'x'
    )
)

# Not spam (class "0") represented with blue circles
trace2 = Scatter(
    x = X[y == 0, f1],
    y = X[y == 0, f2],
    mode = 'markers',
    name = 'Not spam ("0")',
    marker = dict(
        color  = 'blue',
        symbol = 'circle'
    )
)

# NOTE: both axes are logarithmic, so observations whose value is exactly 0
# on either feature cannot be drawn and are missing from this figure.
layout = Layout(
    xaxis = dict(title = header[f1], type='log'),
    yaxis = dict(title = header[f2], type='log'),
    height= 600,
)

fig = dict(data = [trace1, trace2], layout = layout)

iplot(fig)


Scatterplot Matrix


In [24]:
# Create a grid of scatterplots for every pairing of the first few features.
# Subplot (row, col) shows feature `col` on the x-axis and feature `row` on
# the y-axis; class 1 is spam, class 0 is not spam.

from plotly import tools

n_grid = 4  # number of leading features to compare against each other

fig = tools.make_subplots(rows=n_grid, cols=n_grid, shared_xaxes=True, shared_yaxes=True)

for row in range(n_grid):
    for col in range(n_grid):
        # red x, spam (class 1)
        trace1 = Scatter(
            x = X[y == 1, col],
            y = X[y == 1, row],
            mode = 'markers',
            marker = dict(
                color  = 'red',
                symbol = 'x',
                opacity = .5
            )
        )
        # blue circles, not spam (class 0)
        trace2 = Scatter(
            x = X[y == 0, col],
            y = X[y == 0, row],
            mode = 'markers',
            marker = dict(
                color  = 'blue',
                symbol = 'circle',
                opacity = .5
            )
        )
        fig.append_trace(trace1, row + 1, col + 1)
        fig.append_trace(trace2, row + 1, col + 1)
        # With shared axes the x-axes are numbered per column and the y-axes
        # per row, so each title is indexed by the coordinate whose data it
        # describes. The previous code indexed x by row and y by column, which
        # only produced correct titles because the grid is square.
        fig['layout']['xaxis' + str(col + 1)].update(title=header[col])
        fig['layout']['yaxis' + str(row + 1)].update(title=header[row])

fig['layout'].update(
    showlegend=False,
    height=900,
)

iplot(fig)


This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y1 ]  [ (1,3) x3,y1 ]  [ (1,4) x4,y1 ]
[ (2,1) x1,y2 ]  [ (2,2) x2,y2 ]  [ (2,3) x3,y2 ]  [ (2,4) x4,y2 ]
[ (3,1) x1,y3 ]  [ (3,2) x2,y3 ]  [ (3,3) x3,y3 ]  [ (3,4) x4,y3 ]
[ (4,1) x1,y4 ]  [ (4,2) x2,y4 ]  [ (4,3) x3,y4 ]  [ (4,4) x4,y4 ]

3D Scatter


In [25]:
# Create a 3D scatterplot using the first three features, coloured by class.
# Legend labels now name the actual classes of this data set (spam / not spam)
# instead of the "Low/High Quality" wording carried over from another data set.

f1 = 0
f2 = 1
f3 = 2

# Parallel lists: entry i describes how class desc["classes"][i] is drawn
desc = dict(
    classes = [1, 0],
    colors  = ["red", "blue"],
    labels  = ['Spam ("1")', 'Not spam ("0")'],
    symbols = ["x", "circle"]
)

data = [
    Scatter3d(
        x = X[y == desc["classes"][i], f1],
        y = X[y == desc["classes"][i], f2],
        z = X[y == desc["classes"][i], f3],
        name = desc["labels"][i],
        mode = "markers",
        marker = dict(
            size = 2.5,
            symbol = desc["symbols"][i],
            color  = desc["colors"][i]
        )
    ) for i in range(len(desc["labels"]))
]

layout = Layout(
    scene=Scene(
        xaxis=XAxis(title=header[f1], titlefont=dict(size=11)),
        yaxis=YAxis(title=header[f2], titlefont=dict(size=11)),
        zaxis=ZAxis(title=header[f3], titlefont=dict(size=11))
    ),
    margin=Margin(l=80, r=80, b=0, t=0, pad=0, autoexpand=True),
    height= 600,
)

fig = dict(data = data, layout = layout)

iplot(fig)


Correlation Matrix


In [26]:
# Pairwise correlation coefficients between the feature columns.
# rowvar=0 tells NumPy that each column (not each row) is a variable, so the
# result is a square matrix with one row/column per feature.

correlationMatrix = np.corrcoef(X_scaled, rowvar=0)
correlationMatrix


Out[26]:
array([[ 1.        , -0.03426666, -0.03644783, ...,  0.12823004,
         0.03589436, -0.05449204],
       [-0.03426666,  1.        ,  0.13868821, ..., -0.00811994,
         0.08081546,  0.10140862],
       [-0.03644783,  0.13868821,  1.        , ..., -0.01850327,
         0.75572733,  0.40890266],
       ..., 
       [ 0.12823004, -0.00811994, -0.01850327, ...,  1.        ,
        -0.00952603, -0.03624095],
       [ 0.03589436,  0.08081546,  0.75572733, ..., -0.00952603,
         1.        ,  0.32595725],
       [-0.05449204,  0.10140862,  0.40890266, ..., -0.03624095,
         0.32595725,  1.        ]])

In [27]:
# Create a heatmap of the correlation coefficients.
# `header` also contains the class-label column as its last entry, whereas the
# correlation matrix only covers the feature columns, so the label is dropped
# to keep the tick labels the same length as the matrix.
feature_names = header[:-1]

data = [
    Heatmap(
        x = feature_names,      # features on both
        y = feature_names,      #  axes
        z = correlationMatrix,  # correlation as colour
        colorscale='YlOrRd',    # light yellow-orange-red colormap ('RdOrBl' is not a Plotly scale)
        reversescale=True       # inverse colormap order
    )
]

layout = Layout(
    xaxis = dict(title = "Feature"),
    yaxis = dict(title = "Feature"),
    margin= Margin(l=250),      # wide left margin for the long feature names
    height = 700,
)

fig = dict(data = data, layout = layout)

iplot(fig)


Machine Learning

1) Apply KNN classification algorithm

Split the data into training and test sets


In [28]:
# Scale the dataset with preprocessing.scale.
# NOTE(review): the scaling statistics are computed on all rows before the
# train/test split below, so the test rows influence the transformation.

X_scaled = preprocessing.scale(X)

In [29]:
# Split the scaled features and labels into training and test sets;
# random_state=1 makes the shuffle reproducible
XTrain, XTest, yTrain, yTest = train_test_split(X_scaled, y, random_state=1)

In [30]:
# Show the shape of each of the four arrays produced by the split

print ("XTrain dimensions: ", np.shape(XTrain))
print ("yTrain dimensions: ", np.shape(yTrain))
print ("XTest dimensions: ", np.shape(XTest))
print ("yTest dimensions: ", np.shape(yTest))


('XTrain dimensions: ', (750, 61))
('yTrain dimensions: ', (750,))
('XTest dimensions: ', (250, 61))
('yTest dimensions: ', (250,))

In [31]:
# Calculate the frequency of classes in yTest (one [class, count] row per class)

yFreq = scipy.stats.itemfreq(yTest)
print (yFreq)


[[  0 170]
 [  1  80]]

In [32]:
# Fit a 3-nearest-neighbour classifier on the training split, predict the test
# split, and report the accuracy rounded to two decimals

knn3 = KNeighborsClassifier(n_neighbors=3).fit(XTrain, yTrain)
yPredK3 = knn3.predict(XTest)

print ("Overall Accuracy:", round(metrics.accuracy_score(y_true=yTest, y_pred=yPredK3), 2))


('Overall Accuracy:', 0.87)

In [33]:
# Same experiment with a much larger neighbourhood (99 neighbours)

knn99 = KNeighborsClassifier(n_neighbors=99).fit(XTrain, yTrain)
yPredK99 = knn99.predict(XTest)

print ("Overall Accuracy:", round(metrics.accuracy_score(y_true=yTest, y_pred=yPredK99), 2))


('Overall Accuracy:', 0.8)

Calculate validation metrics for your classifier


In [34]:
# Confusion matrix of the 3-NN predictions: rows are true classes, columns predicted classes

mat = metrics.confusion_matrix(y_true=yTest, y_pred=yPredK3)
print (mat)


[[155  15]
 [ 18  62]]

In [35]:
# Per-class precision / recall / F1 for the 3-NN predictions, followed by overall accuracy

print (metrics.classification_report(y_true=yTest, y_pred=yPredK3))
print ("accuracy: ", round(metrics.accuracy_score(y_true=yTest, y_pred=yPredK3), 2))


             precision    recall  f1-score   support

          0       0.90      0.91      0.90       170
          1       0.81      0.78      0.79        80

avg / total       0.87      0.87      0.87       250

('accuracy: ', 0.87)

Plot the decision boundaries for different models


In [36]:
# Check the arguments of the function (prints the signature of the project helper)
help(visplots.knnDecisionPlot)

# Visualise the boundaries of a 3-nearest-neighbour model
visplots.knnDecisionPlot(XTrain, yTrain, XTest, yTest, header, n_neighbors= 3)


Help on function knnDecisionPlot in module visplots:

knnDecisionPlot(XTrain, yTrain, XTest, yTest, header, n_neighbors, weights='uniform')


In [ ]:
# Same decision-boundary plot for the 99-nearest-neighbour model, for comparison
visplots.knnDecisionPlot(XTrain, yTrain, XTest, yTest, header, n_neighbors= 99)

Different weight configurations


In [ ]:
# 3-NN classifier whose neighbours vote with weights="distance" instead of
# uniformly; report its metrics and draw its decision boundaries

knnW3 = KNeighborsClassifier(weights='distance', n_neighbors=3).fit(XTrain, yTrain)
predictedW3 = knnW3.predict(XTest)

print (metrics.classification_report(y_true=yTest, y_pred=predictedW3))
print ("Overall Accuracy:", round(metrics.accuracy_score(y_true=yTest, y_pred=predictedW3), 2))

visplots.knnDecisionPlot(XTrain, yTrain, XTest, yTest, header, n_neighbors= 3, weights="distance")

K-fold cross-validation


In [ ]:
# 5-fold cross-validation of the 3-NN classifier on the training split:
# prints the per-fold scores and then their mean

knn3scores = cross_val_score(estimator=knn3, X=XTrain, y=yTrain, cv=5)
print (knn3scores)
print ("Mean of scores KNN3", knn3scores.mean())

Grid search on hyperparameters


In [ ]:
# Exhaustive 10-fold cross-validated search over the KNN hyperparameters

weights     = ['uniform','distance']
n_neighbors = np.arange(1, 51, 2)  # 1, 3, ..., 49: odd neighbourhood sizes only
parameters  = [{'n_neighbors': n_neighbors, 'weights': weights}]

gridCV = GridSearchCV(KNeighborsClassifier(), parameters, cv=10, n_jobs=-1)
gridCV.fit(XTrain, yTrain)

# Pull the winning combination out of the fitted search

best_knn_params = gridCV.best_params_
bestNeighbors = best_knn_params['n_neighbors']
bestWeight    = best_knn_params['weights']

print ("Best parameters: n_neighbors=", bestNeighbors, "and weight=", bestWeight)

In [ ]:
# Arrange the grid-search results as a matrix: one row per n_neighbors value,
# column 0 for 'uniform' weights and column 1 for anything else ('distance').
# Each grid_scores_ entry exposes its parameter dict at [0] and its score at [1].
scores = np.zeros((len(n_neighbors), len(weights)))

for score in gridCV.grid_scores_:
    setting = score[0]
    i = np.argmax(n_neighbors == setting['n_neighbors'])    # position of this k in n_neighbors
    j = 1 if setting['weights'] != 'uniform' else 0
    scores[i, j] = score[1]

In [ ]:
# Heatmap of the grid-search scores: n_neighbors on x, weighting scheme on y

accuracy_bar = {
    "title": "Classification Accuracy",
    "len": 5,
    "nticks": 10
}

data = [
    Heatmap(
        x = n_neighbors,
        y = weights,
        z = scores.T,               # transposed so rows follow `weights` and columns `n_neighbors`
        colorscale = 'Jet',
        reversescale = True,
        colorbar = accuracy_bar
    )
]

layout = Layout(
    height = 230,
    xaxis = {"title": "Number of K nearest neighbors", "tickvals": n_neighbors},
    yaxis = {"title": "Weights"},
)

fig = {"data": data, "layout": layout}

iplot(fig)

In [ ]:
# Refit KNN with the hyperparameters chosen by the grid search and evaluate it on the test split

knn = KNeighborsClassifier(weights = bestWeight, n_neighbors = bestNeighbors).fit(XTrain, yTrain)
yPredKnn = knn.predict(XTest)

print (metrics.classification_report(y_true=yTest, y_pred=yPredKnn))
print ("Overall Accuracy:", round(metrics.accuracy_score(y_true=yTest, y_pred=yPredKnn), 2))

Randomized search on hyperparameters


In [ ]:
# Randomised search: try 20 values of n_neighbors drawn from randint(1, 200)

parameters = {'n_neighbors': randint(1,200)}

randomCV = RandomizedSearchCV(
    KNeighborsClassifier(),
    param_distributions=parameters,
    n_iter=20,
)
randomCV.fit(XTrain, yTrain)

# Report the neighbourhood size that scored best
bestNeighbors = randomCV.best_params_['n_neighbors']
print("Best parameters: n_neighbors=", bestNeighbors)

In [ ]:
# Separate the sampled n_neighbors values from their scores;
# each grid_scores_ entry holds the parameter dict at [0] and the score at [1]
neighbor = []
result   = []
for score_tuple in randomCV.grid_scores_:
    neighbor.append(score_tuple[0]['n_neighbors'])
    result.append(score_tuple[1])

In [ ]:
# Scatter plot of the randomised-search results: sampled k against its score

search_points = Scatter(
    x = neighbor,
    y = result,
    mode = "markers"
)
data = [search_points]

layout = Layout(
    width = 900,
    height = 500,
    xaxis = {"title": "Number of k nearest neighbors"},
    yaxis = {"title": "Classification Accuracy"},
)

fig = {"data": data, "layout": layout}

iplot(fig)

In [ ]:
# Refit KNN with the n_neighbors found by the randomised search and evaluate it on the test split

knn = KNeighborsClassifier(n_neighbors=bestNeighbors).fit(XTrain, yTrain)
yPredKnn = knn.predict(XTest)

print (metrics.classification_report(y_true=yTest, y_pred=yPredKnn))
print ("Overall Accuracy:", round(metrics.accuracy_score(y_true=yTest, y_pred=yPredKnn), 2))

2) Decision Tree


In [ ]:
# Split into training and test sets.
# This split uses the unscaled feature matrix X (the KNN section used X_scaled)
# and rebinds XTrain/XTest/yTrain/yTest for the cells that follow.
XTrain, XTest, yTrain, yTest = train_test_split(X, y, random_state=1)

In [ ]:
# Calculate the frequency of classes in yTest (one [class, count] row per class)
yFreq = scipy.stats.itemfreq(yTest)
print (yFreq)

In [ ]:
# Depth-3 decision tree: fit on the training split, evaluate on the test split,
# then draw its decision boundaries with the project helper
dtc = DecisionTreeClassifier(max_depth=3).fit(XTrain, yTrain)
predDT = dtc.predict(XTest)

print (metrics.classification_report(y_true=yTest, y_pred=predDT))
print ("Overall Accuracy:", round(metrics.accuracy_score(y_true=yTest, y_pred=predDT),2))

visplots.dtDecisionPlot(XTrain, yTrain, XTest, yTest, header, max_depth=3)

3) Random Forests


In [ ]:
# Split the unscaled features and labels into training and test sets;
# random_state=1 makes the shuffle reproducible
XTrain, XTest, yTrain, yTest = train_test_split(X, y, random_state=1)

In [ ]:
# Calculate the frequency of classes in yTest (one [class, count] row per class)
yFreq = scipy.stats.itemfreq(yTest)
print (yFreq)

In [ ]:
# Random forest of 100 trees (fixed seed, 4 parallel jobs): fit, predict, report

rf = RandomForestClassifier(n_jobs=4, random_state=0, n_estimators=100).fit(XTrain, yTrain)
predRF = rf.predict(XTest)

print (metrics.classification_report(y_true=yTest, y_pred=predRF))
print ("Overall Accuracy:", round(metrics.accuracy_score(y_true=yTest, y_pred=predRF),2))

Visualising the RF accuracy


In [ ]:
# Visualise the average accuracy of the fitted forest on the test split
# (plot produced by the project helper visplots.rfAvgAcc)

visplots.rfAvgAcc(rfModel = rf, XTest =XTest, yTest=yTest)

Feature Importance


In [ ]:
# Display the most important features of the fitted forest in a barplot.
# The forest yields one importance per feature column of X, so the names must
# be taken from the same positions. Previously the first 10 header entries were
# paired with the full importance vector, so the bars did not correspond to
# the top features and the two sequences had different lengths.

importance = rf.feature_importances_
n_top = 10                                   # number of features to show

# Indices of the n_top largest importances, in ascending order so that the
# most important feature ends up as the top bar of the horizontal chart
top_idx = np.argsort(importance)[-n_top:]
names = header[:-1][top_idx]                 # header[-1] is the class label, not a feature

data = [
    Bar(
        x = importance[top_idx],
        y = names,
        orientation = 'h',
    )
]

layout = Layout(
    xaxis = dict(title = "Importance of features"),
    yaxis = dict(title = "Features"),
    width = 800,
    margin=Margin(
        l=250,      # wide left margin for the long feature names
        r=50,
        b=100,
        t=50,
        pad=4
    ),
)

fig = dict(data = data, layout = layout)

iplot(fig)

Boundary visualisation


In [ ]:
# Check the arguments of the function (prints the signature of the project helper)
help(visplots.rfDecisionPlot)

# Visualise the boundaries
visplots.rfDecisionPlot(XTrain, yTrain, XTest, yTest, header)

Tuning Random Forests


In [ ]:
# 10-fold cross-validated grid search over two random-forest hyperparameters

# Candidate values for the two parameters that are searched:
max_depth    = np.arange(1, 100, 5)
n_estimators = np.arange(1, 30, 5)

# Also, you may choose any of the following
# max_features = [1, 3, 10]
# min_samples_split = [1, 3, 10]
# min_samples_leaf  = [1, 3, 10]
# bootstrap = [True, False]
# criterion = ["gini", "entropy"]

parameters   = [{'n_estimators': n_estimators, 'max_depth': max_depth}]

gridCV = GridSearchCV(RandomForestClassifier(), param_grid=parameters, cv=10, n_jobs=4)
gridCV.fit(XTrain, yTrain)


# Pull the winning combination out of the fitted search

best_rf_params    = gridCV.best_params_
best_n_estim      = best_rf_params['n_estimators']
best_max_depth    = best_rf_params['max_depth']

print ("Best parameters: n_estimators=", best_n_estim,", max_depth=", best_max_depth)

In [ ]:
# Refit a random forest with the grid-search winners and evaluate it on the test split

clfRDF = RandomForestClassifier(max_depth=best_max_depth, n_estimators=best_n_estim)
clfRDF.fit(XTrain, yTrain)
predRF = clfRDF.predict(XTest)

print (metrics.classification_report(y_true=yTest, y_pred=predRF))
print ("Overall Accuracy:", round(metrics.accuracy_score(y_true=yTest, y_pred=predRF),2))

In [ ]:
# Heatmap of the random-forest grid search, analogous to the KNN one

# Place each score at (index of its n_estimators, index of its max_depth).
# Each grid_scores_ entry holds the parameter dict at [0] and the score at [1].
scores = np.zeros((len(n_estimators), len(max_depth)))

for score in gridCV.grid_scores_:
    setting = score[0]
    i = np.argmax(n_estimators == setting['n_estimators'])
    j = np.argmax(max_depth == setting['max_depth'])
    scores[i, j] = score[1]

# Draw the matrix: number of estimators on x, maximum depth on y
accuracy_bar = {
    "title": "Classification Accuracy",
    "nticks": 10
}

data = [
    Heatmap(
        x = n_estimators,
        y = max_depth,
        z = scores.T,               # transposed so rows follow max_depth
        colorscale = 'Jet',
        reversescale = True,
        colorbar = accuracy_bar
    )
]

layout = Layout(
    height = 800,
    xaxis = {"title": "Number of estimators"},
    yaxis = {"title": "Max Depth", "tickvals": max_depth},
)

fig = {"data": data, "layout": layout}

iplot(fig)

In [ ]:
# Read the submission data set and preview its first rows
submit_dataset_path = "./processed_data/spam_submit.csv"
submit_dataset = pd.read_csv(submit_dataset_path, sep=",")
submit_dataset.head()

In [ ]:
# Drop the first column and move the second column to the end, mirroring the
# column reordering applied to the training data set.
# NOTE: positional, so running this cell twice removes further columns.
submit_columns = submit_dataset.columns.tolist()
cols = submit_columns[2:] + [submit_columns[1]]
submit_dataset = submit_dataset[cols]
submit_dataset.head()

In [ ]:
# Convert the submission frame to an array (this rebinds npArray), then take
# the last column as labels and all preceding columns as float features
npArray = np.array(submit_dataset)
submit_y = npArray[:, -1]
submit_X = npArray[:, :-1].astype(float)

In [ ]:
# Encode the submission labels as integers.
# NOTE(review): this fits a brand-new encoder on the submission labels and
# rebinds `le`, replacing the encoder fitted on the training labels. The
# label-to-integer mapping is therefore derived from the submission data alone;
# confirm it matches the training mapping before comparing against predictions.
le = preprocessing.LabelEncoder()
submit_y  = le.fit_transform(submit_y)

In [ ]:
# Predict the submission rows with the tuned random forest (this rebinds predRF)
predRF = clfRDF.predict(submit_X)

# Display the predicted class codes as the cell output
predRF

# Evaluation against submit_y is disabled:
# print (metrics.classification_report(submit_y, predRF))
# print ("Overall Accuracy:", round(metrics.accuracy_score(submit_y, predRF),2))

Parallelisation


In [ ]:
# Measure how long fitting a 500-tree forest takes for n_jobs = 1 .. 8

import timeit

n_jobs_to_try = np.arange(1,9)
elapsed_times = []

for jobs in n_jobs_to_try:
    rf = RandomForestClassifier(n_estimators=500, random_state=0, n_jobs=jobs)
    # Only the fit call is inside the timed region
    start_time = timeit.default_timer()
    rf.fit(XTrain, yTrain)
    elapsed = timeit.default_timer() - start_time
    elapsed_times.append(elapsed)

In [ ]:
# Bar chart of fit time (seconds) against the number of parallel jobs
timing_bars = Bar(
    x=n_jobs_to_try,
    y=elapsed_times
)
data = [timing_bars]

fig = {"data": data, "layout": Layout(yaxis={"title": 'seconds'})}

iplot(fig)

In [ ]:
# The same timings drawn as a connected line with markers
timing_line = Scatter(
    x=n_jobs_to_try,
    y=elapsed_times,
    mode='markers+lines',
)
data = [timing_line]

fig = {"data": data, "layout": Layout(yaxis={"title": 'seconds'})}

iplot(fig)

4) Support Vector Machines (SVMs)

Linear SVMs


In [ ]:
# Load libraries

from sklearn.svm import SVC
import matplotlib.pyplot as plt

import matplotlib_visplots

%matplotlib inline

In [ ]:
# Split into training and test sets.
# The models fitted from here on (SVMs, logistic regression, neural network)
# are sensitive to feature scale, and the raw features span very different
# ranges (see the describe() table above), so the standardised matrix
# X_scaled is used here instead of the raw X used for the tree-based models.
XTrain, XTest, yTrain, yTest = train_test_split(X_scaled, y, random_state=1)

In [ ]:
# Calculate the frequency of classes in yTest (one [class, count] row per class)
yFreq = scipy.stats.itemfreq(yTest)
print (yFreq)

In [ ]:
# Linear-kernel SVM with C=1.0: fit on the training split, evaluate on the test split
linearSVM = SVC(kernel='linear', C=1.0)
linearSVM.fit(XTrain, yTrain)
yPredLinear = linearSVM.predict(XTest)

# print() with a single pre-formatted string gives the same output on Python 2
# and Python 3 (the bare `print x, y` statement is a syntax error on Python 3)
print(metrics.classification_report(yTest, yPredLinear))
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, yPredLinear), 2)))

In [ ]:
# Check the arguments of the function (prints the signature of the project helper)
help(matplotlib_visplots.svmDecisionPlot)

# Visualise the decision boundaries of the linear-kernel SVM

matplotlib_visplots.svmDecisionPlot(XTrain, yTrain, XTest, yTest, 'linear')

Non-linear (RBF) SVMs


In [ ]:
# RBF-kernel SVM with C=1.0.
# gamma=0.0 was a legacy sentinel meaning "use 1 / n_features"; that sentinel
# was deprecated, so the intended value is now written out explicitly.
rbf_gamma = 1.0 / XTrain.shape[1]

rbfSVM = SVC(kernel='rbf', C=1.0, gamma=rbf_gamma)
rbfSVM.fit(XTrain, yTrain)
yPredRBF = rbfSVM.predict(XTest)

# print() with a single pre-formatted string gives the same output on Python 2
# and Python 3 (the bare `print x, y` statement is a syntax error on Python 3)
print(metrics.classification_report(yTest, yPredRBF))
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, yPredRBF), 2)))

In [ ]:
# Check the arguments of the function (prints the signature of the project helper)
help(matplotlib_visplots.svmDecisionPlot)

# Visualise the decision boundaries of the RBF-kernel SVM
matplotlib_visplots.svmDecisionPlot(XTrain, yTrain, XTest, yTest, 'rbf')

Hyperparameter Tuning for non-linear SVMs


In [ ]:
# Define the parameters to be optimised and their values/ranges.
# Both gamma and the cost C are searched on a base-2 logarithmic grid.
g_range = 2. ** np.arange(-15, 5, step=2)
C_range = 2. ** np.arange(-5, 15, step=2)

parameters = [{'gamma': g_range, 'C': C_range}]

# 10-fold cross-validated exhaustive search with the default SVC kernel
grid = GridSearchCV(SVC(), parameters, cv= 10)
grid.fit(XTrain, yTrain)

bestG = grid.best_params_['gamma']
bestC = grid.best_params_['C']
# The exponents are reported, so the message spells out the base explicitly;
# previously log2 values were printed under the labels "gamma=" and "Cost=".
print("The best parameters are: gamma=2^{} and Cost=2^{}".format(np.log2(bestG), np.log2(bestC)))

In [ ]:
# Heatmap of the SVM grid-search scores.
# The grid is enumerated with the parameter names in sorted order ('C' before
# 'gamma') and the last one varying fastest, so the flat score list reshapes
# to rows = C values and columns = gamma values.
scores = [entry[1] for entry in grid.grid_scores_]
scores = np.array(scores).reshape(len(C_range), len(g_range))

plt.figure(figsize=(10, 6))
# origin must be 'upper' or 'lower'; 'lower' puts row 0 (smallest C) at the
# bottom, matching the ascending tick labels ('higher' is not a valid value)
plt.imshow(scores, interpolation='nearest', origin='lower', cmap=plt.cm.get_cmap('jet_r'))
plt.xticks(np.arange(len(g_range)), np.log2(g_range))
plt.yticks(np.arange(len(C_range)), np.log2(C_range))
plt.xlabel('gamma (log2)')
plt.ylabel('Cost (log2)')

cbar = plt.colorbar()
cbar.set_label('Classification Accuracy', rotation=270, labelpad=20)
plt.show()

5) Logistic Regression


In [ ]:
from sklearn.linear_model import LogisticRegression

In [ ]:
# Logistic regression with default settings: fit on the training split,
# evaluate on the test split
l_regression = LogisticRegression()
l_regression.fit(XTrain, yTrain)
l_prediction = l_regression.predict(XTest)

# print() with a single pre-formatted string gives the same output on Python 2
# and Python 3 (the bare `print x, y` statement is a syntax error on Python 3)
print(metrics.classification_report(yTest, l_prediction))
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, l_prediction), 2)))

In [ ]:
# Check the arguments of the function (prints the signature of the project helper)
help(matplotlib_visplots.logregDecisionPlot)

# Visualise the decision boundaries of logistic regression
matplotlib_visplots.logregDecisionPlot(XTrain, yTrain, XTest, yTest)

Tuning Logistic Regression


In [ ]:
# Define the parameters to be optimised and their values/ranges:
# the penalty norm and the inverse regularisation strength C (base-2 log grid)
pen = ['l1','l2']
C_range = 2. ** np.arange(-5, 15, step=2)

parameters = [{'C': C_range, 'penalty': pen}]

# 10-fold cross-validated exhaustive search
grid = GridSearchCV(LogisticRegression(), parameters, cv= 10)
grid.fit(XTrain, yTrain)

bestC = grid.best_params_['C']
bestP = grid.best_params_['penalty']
# Single pre-formatted string: identical output on Python 2 and Python 3
print("The best parameters are: cost={} and penalty={}".format(bestC, bestP))

In [ ]:
# Heatmap of the logistic-regression grid-search scores.
# The grid is enumerated with the parameter names in sorted order ('C' before
# 'penalty') and the last one varying fastest, i.e. the flat score list is
# C-major: [C0/l1, C0/l2, C1/l1, C1/l2, ...]. It must therefore be reshaped to
# (len(C_range), len(pen)) directly. The previous reshape to
# (len(pen), len(C_range)) followed by a transpose produced the right shape
# but assigned scores to the wrong (C, penalty) cells.
scores = [entry[1] for entry in grid.grid_scores_]
scores = np.array(scores).reshape(len(C_range), len(pen))

plt.figure(figsize=(12, 6))
# origin must be 'upper' or 'lower'; 'lower' puts row 0 (smallest C) at the
# bottom, matching the ascending tick labels ('higher' is not a valid value)
plt.imshow(scores, interpolation='nearest', origin='lower', cmap=plt.cm.get_cmap('jet_r'))
plt.xticks(np.arange(len(pen)), pen)
plt.yticks(np.arange(len(C_range)), C_range)
plt.xlabel('penalisation norm')
plt.ylabel('inv regularisation strength')

cbar = plt.colorbar()
cbar.set_label('Classification Accuracy', rotation=270, labelpad=20)

plt.show()

In [ ]:
# Refit logistic regression with the grid-search winners and evaluate it on the test split
l_regression = LogisticRegression(C=bestC, penalty=bestP)
l_regression.fit(XTrain, yTrain)
l_prediction = l_regression.predict(XTest)

# print() with a single pre-formatted string gives the same output on Python 2
# and Python 3 (the bare `print x, y` statement is a syntax error on Python 3)
print(metrics.classification_report(yTest, l_prediction))
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, l_prediction), 2)))

6) Neural Networks


In [ ]:
from multilayer_perceptron import multilayer_perceptron

In [ ]:
# Multilayer perceptron with logistic activation, hidden_layer_sizes=2 and an
# initial learning rate of 0.5: fit on the training split, evaluate on the test split
nnet = multilayer_perceptron.MultilayerPerceptronClassifier(activation='logistic',
                                                            hidden_layer_sizes=2, learning_rate_init=.5)
nnet.fit(XTrain, yTrain)
net_prediction = nnet.predict(XTest)

# print() with a single pre-formatted string gives the same output on Python 2
# and Python 3 (the bare `print x, y` statement is a syntax error on Python 3)
print(metrics.classification_report(yTest, net_prediction))
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, net_prediction), 2)))

In [ ]:
# Check the arguments of the function (prints the signature of the project helper)
help(matplotlib_visplots.nnDecisionPlot)

# Decision boundaries for two settings of the fifth argument (2 and (2,3,6)),
# both with 0.5 as the sixth argument.
# NOTE(review): presumably hidden-layer sizes and learning rate — confirm against the help output.
matplotlib_visplots.nnDecisionPlot(XTrain, yTrain, XTest, yTest, 2, .5)
matplotlib_visplots.nnDecisionPlot(XTrain, yTrain, XTest, yTest, (2,3,6), .5)

Tuning Neural Nets


In [ ]:
# Define the parameters to be optimised and their values/ranges:
# the hidden-layer layout and the initial learning rate
layer_size_range = [(3,2),(10,10),(2,2,2),10,5] # different networks shapes
learning_rate_range = np.linspace(.1,1,3)

parameters = [{'hidden_layer_sizes': layer_size_range, 'learning_rate_init': learning_rate_range}]

# 10-fold cross-validated exhaustive search
grid = GridSearchCV(multilayer_perceptron.MultilayerPerceptronClassifier(), parameters, cv= 10)
grid.fit(XTrain, yTrain)

best_size    = grid.best_params_['hidden_layer_sizes']
best_best_lr = grid.best_params_['learning_rate_init']
# Single pre-formatted string: identical output on Python 2 and Python 3
print("The best parameters are: hidden_layer_sizes={} and learning_rate_init={}".format(best_size, best_best_lr))

In [ ]:
# Heatmap of the neural-network grid-search scores.
# The grid is enumerated with the parameter names in sorted order
# ('hidden_layer_sizes' before 'learning_rate_init') and the last one varying
# fastest, so the flat list reshapes to rows = layer layouts and columns =
# learning rates; the transpose then puts learning rates on the rows (y-axis).
scores = [entry[1] for entry in grid.grid_scores_]
scores = np.array(scores).reshape(len(layer_size_range), len(learning_rate_range))
scores = np.transpose(scores)

plt.figure(figsize=(12, 6))
# origin must be 'upper' or 'lower'; 'lower' puts row 0 (smallest learning
# rate) at the bottom, matching the tick labels ('higher' is not a valid value)
plt.imshow(scores, interpolation='nearest', origin='lower', cmap=plt.cm.get_cmap('jet_r'))
plt.xticks(np.arange(len(layer_size_range)), layer_size_range)
plt.yticks(np.arange(len(learning_rate_range)), learning_rate_range)
plt.xlabel('hidden layer topology')
plt.ylabel('learning rate')

cbar = plt.colorbar()
cbar.set_label('Classification Accuracy', rotation=270, labelpad=20)

plt.show()

In [ ]:
# Refit the multilayer perceptron with the grid-search winners and evaluate it on the test split
nnet = multilayer_perceptron.MultilayerPerceptronClassifier(hidden_layer_sizes=best_size, learning_rate_init=best_best_lr)
nnet.fit(XTrain, yTrain)
net_prediction = nnet.predict(XTest)

# print() with a single pre-formatted string gives the same output on Python 2
# and Python 3 (the bare `print x, y` statement is a syntax error on Python 3)
print(metrics.classification_report(yTest, net_prediction))
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, net_prediction), 2)))

In [ ]: