Competition Testing

Load the required libraries



In [1]:

    
import scipy
import numpy as np
import pandas as pd
import plotly.plotly as py

import visplots

from plotly.graph_objs import *
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
from sklearn import preprocessing, metrics
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.grid_search import GridSearchCV, RandomizedSearchCV
from scipy.stats.distributions import randint

# Initialise plotly's offline notebook mode before any iplot() call below
init_notebook_mode()

ready_message = "libraries all imported, ready to go"
print(ready_message)









    











    



libraries all imported, ready to go

Importing the data



In [4]:

    
# Read the spam CSV into a DataFrame and preview its first rows
dataset_name = "Spam"
dataset_path = "spam_dataset.csv"

dataset = pd.read_csv(dataset_path, sep=",")
dataset.head()









    Out[4]:






  
    
      
      email_id
      is_spam
      word_freq_will
      word_freq_original
      word_freq_415
      word_freq_mail
      char_freq_#
      char_freq_$
      word_freq_internet
      word_freq_edu
      ...
      word_freq_receive
      word_freq_000
      capital_run_length_average
      word_freq_address
      word_freq_george
      word_freq_cs
      word_freq_random
      word_freq_conference
      word_freq_technology
      char_freq_(
    
  
  
    
      0
      3628
      no
      0.00
      0
      0
      0.00
      0
      0
      0.0
      0
      ...
      0.00
      0
      2.000
      0.00
      0.00
      0
      0
      0
      0
      0.000
    
    
      1
      63
      no
      0.00
      0
      0
      0.49
      0
      0
      0.0
      0
      ...
      0.00
      0
      2.824
      0.00
      0.99
      0
      0
      0
      0
      0.062
    
    
      2
      1540
      no
      1.31
      0
      0
      0.00
      0
      0
      0.0
      0
      ...
      0.00
      0
      2.176
      0.00
      0.00
      0
      0
      0
      0
      0.431
    
    
      3
      4460
      yes
      0.75
      0
      0
      0.50
      0
      0
      0.5
      0
      ...
      0.25
      0
      1.023
      0.75
      0.00
      0
      0
      0
      0
      0.180
    
    
      4
      2771
      no
      0.00
      0
      0
      0.00
      0
      0
      0.0
      0
      ...
      0.00
      0
      1.500
      0.00
      1.56
      0
      0
      0
      0
      0.180
    
  

5 rows × 63 columns

Split the data into input features, X, and outputs, y



In [5]:

    
# Drop the e-mail identifier and move the class label to the last column.
# Columns are picked by name instead of by position, so re-running this cell on
# the already reordered frame is a no-op rather than silently discarding the
# first two features.
label_col = "is_spam"
cols = [c for c in dataset.columns.tolist() if c not in ("email_id", label_col)]
cols = cols + [label_col]
dataset = dataset[cols]
dataset.head()









    Out[5]:






  
    
      
      word_freq_will
      word_freq_original
      word_freq_415
      word_freq_mail
      char_freq_#
      char_freq_$
      word_freq_internet
      word_freq_edu
      word_freq_hp
      word_freq_lab
      ...
      word_freq_000
      capital_run_length_average
      word_freq_address
      word_freq_george
      word_freq_cs
      word_freq_random
      word_freq_conference
      word_freq_technology
      char_freq_(
      is_spam
    
  
  
    
      0
      0.00
      0
      0
      0.00
      0
      0
      0.0
      0
      0
      0
      ...
      0
      2.000
      0.00
      0.00
      0
      0
      0
      0
      0.000
      no
    
    
      1
      0.00
      0
      0
      0.49
      0
      0
      0.0
      0
      0
      0
      ...
      0
      2.824
      0.00
      0.99
      0
      0
      0
      0
      0.062
      no
    
    
      2
      1.31
      0
      0
      0.00
      0
      0
      0.0
      0
      0
      0
      ...
      0
      2.176
      0.00
      0.00
      0
      0
      0
      0
      0.431
      no
    
    
      3
      0.75
      0
      0
      0.50
      0
      0
      0.5
      0
      0
      0
      ...
      0
      1.023
      0.75
      0.00
      0
      0
      0
      0
      0.180
      yes
    
    
      4
      0.00
      0
      0
      0.00
      0
      0
      0.0
      0
      0
      0
      ...
      0
      1.500
      0.00
      1.56
      0
      0
      0
      0
      0.180
      no
    
  

5 rows × 62 columns



In [6]:

    
# Discard every row that contains at least one missing value
dataset = dataset.dropna(axis=0, how="any")



In [8]:

    
# Materialise the frame as a NumPy array and report its (rows, columns) shape
npArray = np.array(dataset)
array_shape = npArray.shape
print(array_shape)



In [12]:

    
# Column names in the same order as the array columns; after the reordering
# above, the class column ("is_spam") is the last entry.
header = dataset.columns.values
header[1:10]









    Out[12]:





array(['word_freq_original', 'word_freq_415', 'word_freq_mail',
       'char_freq_#', 'char_freq_$', 'word_freq_internet', 'word_freq_edu',
       'word_freq_hp', 'word_freq_lab'], dtype=object)



In [13]:

    
# Split into the feature matrix X (every column but the last, cast to float)
# and the class vector y (the last column)
X = npArray[:, :-1].astype(float)
y = npArray[:, -1]

# Report the dimensions of X and y. Each message is formatted into a single
# string so it prints the same on Python 2 and 3 (a multi-argument print(...)
# is shown as a tuple by the Python 2 kernel).
print("X dimensions: {}".format(X.shape))
print("y dimensions: {}".format(y.shape))









    



('X dimensions:', (1000, 61))
('y dimensions:', (1000,))

Exploratory Data Analysis

Plot y frequencies



In [14]:

    
# Tabulate the class frequencies as rows of (label, count).
# np.unique(..., return_counts=True) replaces the deprecated scipy.stats.itemfreq
# and yields the same two-column table.
labels, counts = np.unique(y, return_counts=True)
yFreq = np.column_stack((labels, counts))
print(yFreq)









    



[['no' 668]
 ['yes' 332]]



In [15]:

    
# Keep the two original class labels (first column of the frequency table) for plotting
first_row, second_row = yFreq[0], yFreq[1]
class_lables = [first_row[0], second_row[0]]
print(class_lables)









    



['no', 'yes']



In [16]:

    
# Encode the categorical class labels as integers, then print the class frequencies

le = preprocessing.LabelEncoder()
y  = le.fit_transform(y)

# np.unique(..., return_counts=True) replaces the deprecated scipy.stats.itemfreq;
# rows are (encoded label, count).
labels, counts = np.unique(y, return_counts=True)
yFreq = np.column_stack((labels, counts))
print(yFreq)



In [17]:

    
# Bar chart of the class frequencies (Plotly)

# One bar per class: label on the x-axis, count on the y-axis
class_bar = Bar(
    x = [class_lables[0], class_lables[1]],
    y = [yFreq[0][1], yFreq[1][1]],
    marker = dict(color=['blue','red'])
)
data = [class_bar]

# Axis titles and figure width
layout = Layout(
    xaxis = {'title': dataset_name},
    yaxis = {'title': "Count"},
    width = 500
)

# Assemble the figure and render it inline
fig = {'data': data, 'layout': layout}
iplot(fig)

Data distributions



In [18]:

    
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
dataset.describe()









    Out[18]:






  
    
      
      word_freq_will
      word_freq_original
      word_freq_415
      word_freq_mail
      char_freq_#
      char_freq_$
      word_freq_internet
      word_freq_edu
      word_freq_hp
      word_freq_lab
      ...
      word_freq_receive
      word_freq_000
      capital_run_length_average
      word_freq_address
      word_freq_george
      word_freq_cs
      word_freq_random
      word_freq_conference
      word_freq_technology
      char_freq_(
    
  
  
    
      count
      1000.000000
      1000.000000
      1000.000000
      1000.000000
      1000.000000
      1000.000000
      1000.000000
      1000.00000
      1000.000000
      1000.000000
      ...
      1000.000000
      1000.000000
      1000.000000
      1000.000000
      1000.000000
      1000
      1000
      1000.000000
      1000.000000
      1000.000000
    
    
      mean
      0.537950
      0.038370
      0.054690
      0.189840
      0.022792
      0.066014
      0.073210
      0.18100
      0.611970
      0.118610
      ...
      0.051040
      0.081300
      4.857610
      0.149980
      0.775740
      0
      0
      0.036690
      0.125580
      0.144783
    
    
      std
      0.831747
      0.173041
      0.365678
      0.496022
      0.109007
      0.248239
      0.270431
      0.86285
      1.734907
      0.746169
      ...
      0.192314
      0.358906
      30.226395
      0.955315
      3.509211
      0
      0
      0.268434
      0.449092
      0.232423
    
    
      min
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.00000
      0.000000
      0.000000
      ...
      0.000000
      0.000000
      1.000000
      0.000000
      0.000000
      0
      0
      0.000000
      0.000000
      0.000000
    
    
      25%
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.00000
      0.000000
      0.000000
      ...
      0.000000
      0.000000
      1.541000
      0.000000
      0.000000
      0
      0
      0.000000
      0.000000
      0.000000
    
    
      50%
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.00000
      0.000000
      0.000000
      ...
      0.000000
      0.000000
      2.219500
      0.000000
      0.000000
      0
      0
      0.000000
      0.000000
      0.072000
    
    
      75%
      0.820000
      0.000000
      0.000000
      0.000000
      0.000000
      0.016000
      0.000000
      0.00000
      0.315000
      0.000000
      ...
      0.000000
      0.000000
      3.396500
      0.000000
      0.000000
      0
      0
      0.000000
      0.000000
      0.195000
    
    
      max
      6.250000
      2.220000
      4.760000
      5.260000
      1.410000
      4.017000
      3.570000
      10.00000
      20.830000
      14.280000
      ...
      2.000000
      5.450000
      667.000000
      14.280000
      33.330000
      0
      0
      5.000000
      4.760000
      2.941000
    
  

8 rows × 61 columns



In [19]:

    
# Box plot of every raw (unscaled) feature

nrow, ncol = X.shape

# One purple box per feature column, labelled with the column name
data = []
for i in range(ncol):
    data.append(
        Box(
            y = X[:, i],
            name = header[i],
            marker = dict(color = "purple"),
        )
    )

layout = Layout(
    xaxis = {'title': "Feature"},
    yaxis = {'title': "Value"},
    showlegend = False,
)

fig = {'data': data, 'layout': layout}

iplot(fig)



In [20]:

    
# Alternative box plot: overlay every observation (jittered) and mark mean / standard deviation

data = []
for i in range(X.shape[1]):
    feature_box = Box(
        y = X[:, i],
        name = header[i],
        boxpoints = 'all',
        jitter = 0.4,
        whiskerwidth = 0.2,
        marker = dict(size=2),
        line = dict(width=1),
        boxmean = 'sd'
    )
    data.append(feature_box)

layout = Layout(
    xaxis = {'title': "Feature", 'tickangle': 40},
    yaxis = {'title': "Value"},
    showlegend = False,
    height = 700,
    margin = Margin(b=170, t=50),
)

fig = {'data': data, 'layout': layout}

iplot(fig)



In [21]:

    
# Box plot of the standardised features

X_scaled = preprocessing.scale(X)

nrow, ncol = X.shape

# One purple box per standardised feature column, labelled with the column name
data = []
for i in range(ncol):
    data.append(
        Box(
            y = X_scaled[:, i],
            name = header[i],
            marker = dict(color = "purple"),
        )
    )

layout = Layout(
    xaxis = {'title': "Feature"},
    yaxis = {'title': "Value"},
    showlegend = False,
)

fig = {'data': data, 'layout': layout}

iplot(fig)

Plot pairs of input features X as scatter plots



In [22]:

    
# Scatter plot of one pair of features (column 0 against column 3)

f1 = 0
f2 = 3

pair_trace = Scatter(
    x = X[:, f1],
    y = X[:, f2],
    mode = "markers"
)
data = [pair_trace]

layout = Layout(
    xaxis = {'title': header[f1]},
    yaxis = {'title': header[f2]}
)

fig = {'data': data, 'layout': layout}

iplot(fig)



In [23]:

    
# Enhanced scatter plot of one pair of features (columns 0 and 3), coloured by class.
# The legend names now describe this dataset: the label encoder maps "no" -> 0
# and "yes" -> 1, so class 1 is spam and class 0 is not spam (the previous
# "Low/High Quality" wording belonged to a different dataset).

f1 = 0
f2 = 3

# Spam (class "1") represented with red x
trace1 = Scatter(
    x = X[y == 1, f1],
    y = X[y == 1, f2],
    mode = 'markers',
    name = 'Spam ("1")',
    marker = dict(
        color  = 'red',
        symbol = 'x'
    )
)

# Not spam (class "0") represented with blue circles
trace2 = Scatter(
    x = X[y == 0, f1],
    y = X[y == 0, f2],
    mode = 'markers',
    name = 'Not spam ("0")',
    marker = dict(
        color  = 'blue',
        symbol = 'circle'
    )
)

# NOTE(review): both axes are logarithmic, so points whose frequency is exactly 0
# cannot be placed on them; confirm this is the intended view.
layout = Layout(
    xaxis = dict(title = header[f1], type='log'),
    yaxis = dict(title = header[f2], type='log'),
    height= 600,
)

fig = dict(data = [trace1, trace2], layout = layout)

iplot(fig)

Scatterplot Matrix



In [24]:

    
# Grid of scatter plots over every pairing of the first four features

from plotly import tools

n_feats = 4
fig = tools.make_subplots(rows=n_feats, cols=n_feats, shared_xaxes=True, shared_yaxes=True)

# (class value, colour, symbol): class 1 as red crosses first, then class 0 as blue circles
class_styles = [(1, 'red', 'x'), (0, 'blue', 'circle')]

for row in range(n_feats):
    for col in range(n_feats):
        for cls, colour, symbol in class_styles:
            members = (y == cls)
            trace = Scatter(
                x = X[members, col],
                y = X[members, row],
                mode = 'markers',
                marker = dict(
                    color  = colour,
                    symbol = symbol,
                    opacity = .5
                )
            )
            # The panel in grid row `row + 1`, grid column `col + 1` shows
            # feature `col` on x and feature `row` on y
            fig.append_trace(trace, row + 1, col + 1)

# Axis number k + 1 is titled with feature k on both the x and the y side
for k in range(n_feats):
    fig['layout']['xaxis' + str(k + 1)].update(title=header[k])
    fig['layout']['yaxis' + str(k + 1)].update(title=header[k])

fig['layout'].update(
    showlegend=False,
    height=900,
)

iplot(fig)









    



This is the format of your plot grid:
[ (1,1) x1,y1 ]  [ (1,2) x2,y1 ]  [ (1,3) x3,y1 ]  [ (1,4) x4,y1 ]
[ (2,1) x1,y2 ]  [ (2,2) x2,y2 ]  [ (2,3) x3,y2 ]  [ (2,4) x4,y2 ]
[ (3,1) x1,y3 ]  [ (3,2) x2,y3 ]  [ (3,3) x3,y3 ]  [ (3,4) x4,y3 ]
[ (4,1) x1,y4 ]  [ (4,2) x2,y4 ]  [ (4,3) x3,y4 ]  [ (4,4) x4,y4 ]

3D Scatter



In [25]:

    
# Create a 3D scatterplot using the first three features, coloured by class

f1 = 0
f2 = 1
f3 = 2

# Per-class plotting attributes. The label encoder maps "no" -> 0 and "yes" -> 1,
# so class 1 is spam and class 0 is not spam (the previous "Low/High Quality"
# legend text belonged to a different dataset).
desc = dict(
    classes = [1, 0],
    colors  = ["red", "blue"],
    labels  = ['Spam ("1")', 'Not spam ("0")'],
    symbols = ["x", "circle"]
)

# One 3D trace per class
data = [
    Scatter3d(
        x = X[y == desc["classes"][i], f1],
        y = X[y == desc["classes"][i], f2],
        z = X[y == desc["classes"][i], f3],
        name = desc["labels"][i],
        mode = "markers",
        marker = dict(
            size = 2.5,
            symbol = desc["symbols"][i],
            color  = desc["colors"][i]
        )
    ) for i in range(len(desc["labels"]))
]

layout = Layout(
    scene=Scene(
        xaxis=XAxis(title=header[f1], titlefont=dict(size=11)),
        yaxis=YAxis(title=header[f2], titlefont=dict(size=11)),
        zaxis=ZAxis(title=header[f3], titlefont=dict(size=11))
    ),
    margin=Margin(l=80, r=80, b=0, t=0, pad=0, autoexpand=True),
    height= 600,
)

fig = dict(data = data, layout = layout)

iplot(fig)

Correlation Matrix



In [26]:

    
# Pairwise correlation coefficients between the feature columns
# (rowvar=0: every column is one variable, every row one observation).
# NOTE(review): the describe() table shows features with zero standard deviation;
# their correlations are presumably NaN - confirm before interpreting the matrix.
correlationMatrix = np.corrcoef(X_scaled, rowvar=0)

correlationMatrix









    Out[26]:





array([[ 1.        , -0.03426666, -0.03644783, ...,  0.12823004,
         0.03589436, -0.05449204],
       [-0.03426666,  1.        ,  0.13868821, ..., -0.00811994,
         0.08081546,  0.10140862],
       [-0.03644783,  0.13868821,  1.        , ..., -0.01850327,
         0.75572733,  0.40890266],
       ..., 
       [ 0.12823004, -0.00811994, -0.01850327, ...,  1.        ,
        -0.00952603, -0.03624095],
       [ 0.03589436,  0.08081546,  0.75572733, ..., -0.00952603,
         1.        ,  0.32595725],
       [-0.05449204,  0.10140862,  0.40890266, ..., -0.03624095,
         0.32595725,  1.        ]])



In [27]:

    
# Create a heatmap of the correlation coefficients

# `header` also contains the class column as its last entry, so keep only as
# many names as the correlation matrix has rows/columns.
feature_names = header[:correlationMatrix.shape[0]]

data = [
    Heatmap(
        x = feature_names,      # features on both
        y = feature_names,      #  axes
        z = correlationMatrix,  # correlation as colour
        colorscale = 'RdBu',    # diverging scale ('RdOrBl' is not a named Plotly colorscale)
        reversescale = True,    # inverse colormap order
        zmin = -1,              # pin the colour range to the full correlation
        zmax = 1                #  range so that zero sits at the centre
    )
]

layout = Layout(
    xaxis = dict(title = "Feature"),
    yaxis = dict(title = "Feature"),
    margin= Margin(l=250),
    height = 700,
)

fig = dict(data = data, layout = layout)

iplot(fig)

Machine Learning

1) Apply KNN classification algorithm

Split the data into training and test sets



In [28]:

    
# Standardise every feature column (centre on the mean, scale to unit variance)

X_scaled = preprocessing.scale(X, axis=0, with_mean=True, with_std=True)



In [29]:

    
# Split into training and test sets, then standardise.
# The raw features are split first and the scaler is fitted on the training rows
# only, so no statistics of the test rows leak into the model inputs. With the
# same random_state the row partition is unchanged from splitting X_scaled.
XTrain_raw, XTest_raw, yTrain, yTest = train_test_split(X, y, random_state=1)

scaler = preprocessing.StandardScaler().fit(XTrain_raw)
XTrain = scaler.transform(XTrain_raw)
XTest  = scaler.transform(XTest_raw)



In [30]:

    
# Print the dimensionality of the individual splits.
# Each message is one formatted string so it prints the same on Python 2 and 3
# (a multi-argument print(...) is shown as a tuple by the Python 2 kernel).
for split_name, split in (("XTrain", XTrain), ("yTrain", yTrain),
                          ("XTest", XTest), ("yTest", yTest)):
    print("{} dimensions: {}".format(split_name, split.shape))









    



('XTrain dimensions: ', (750, 61))
('yTrain dimensions: ', (750,))
('XTest dimensions: ', (250, 61))
('yTest dimensions: ', (250,))



In [31]:

    
# Calculate the frequency of classes in yTest as rows of (class, count).
# np.unique(..., return_counts=True) replaces the deprecated scipy.stats.itemfreq.

labels, counts = np.unique(yTest, return_counts=True)
yFreq = np.column_stack((labels, counts))
print(yFreq)



In [32]:

    
# Build a KNN classifier with 3 nearest neighbors and score it on the test split

knn3 = KNeighborsClassifier(n_neighbors=3)
knn3.fit(XTrain, yTrain)
yPredK3 = knn3.predict(XTest)

# One formatted string: prints the same on Python 2 and 3 instead of a tuple on Python 2
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, yPredK3), 2)))









    



('Overall Accuracy:', 0.87)



In [33]:

    
# Build a KNN classifier with 99 nearest neighbors and score it on the test split

knn99 = KNeighborsClassifier(n_neighbors=99)
knn99.fit(XTrain, yTrain)
yPredK99 = knn99.predict(XTest)

# One formatted string: prints the same on Python 2 and 3 instead of a tuple on Python 2
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, yPredK99), 2)))









    



('Overall Accuracy:', 0.8)

Calculate validation metrics for your classifier



In [34]:

    
# Confusion matrix of the 3-NN predictions against the true test labels

mat = metrics.confusion_matrix(yTest, yPredK3)
print(mat)



In [35]:

    
# Report per-class precision/recall/F1 plus the overall accuracy of the 3-NN model

print(metrics.classification_report(yTest, yPredK3))
# One formatted string: prints the same on Python 2 and 3 instead of a tuple on Python 2
print("accuracy: {}".format(round(metrics.accuracy_score(yTest, yPredK3), 2)))









    



             precision    recall  f1-score   support

          0       0.90      0.91      0.90       170
          1       0.81      0.78      0.79        80

avg / total       0.87      0.87      0.87       250

('accuracy: ', 0.87)

Plot the decision boundaries for different models



In [36]:

    
# Show the signature of the plotting helper
help(visplots.knnDecisionPlot)

# Draw the decision boundaries of a 3-nearest-neighbour model
visplots.knnDecisionPlot(XTrain, yTrain, XTest, yTest, header, n_neighbors=3)









    



Help on function knnDecisionPlot in module visplots:

knnDecisionPlot(XTrain, yTrain, XTest, yTest, header, n_neighbors, weights='uniform')



In [ ]:

    
# Draw the decision boundaries of a 99-nearest-neighbour model
visplots.knnDecisionPlot(XTrain, yTrain, XTest, yTest, header, n_neighbors=99)

Different weight configurations



In [ ]:

    
# Build the classifier with two pre-defined parameters (n_neighbors and weights):
# 3 neighbours, each vote weighted by distance

knnW3 = KNeighborsClassifier(n_neighbors=3, weights='distance')
knnW3.fit(XTrain, yTrain)
predictedW3 = knnW3.predict(XTest)

print(metrics.classification_report(yTest, predictedW3))
# One formatted string: prints the same on Python 2 and 3 instead of a tuple on Python 2
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, predictedW3), 2)))

# Visualise the boundaries of a KNN model with weights equal to "distance"
visplots.knnDecisionPlot(XTrain, yTrain, XTest, yTest, header, n_neighbors=3, weights="distance")

K-fold cross-validation



In [ ]:

    
# 5-fold cross-validation of knn3 on the training split

knn3scores = cross_val_score(knn3, XTrain, yTrain, cv = 5)
print(knn3scores)
# One formatted string: prints the same on Python 2 and 3 instead of a tuple on Python 2
print("Mean of scores KNN3 {}".format(knn3scores.mean()))

Grid search on hyperparameters



In [ ]:

    
# Conduct a grid search with 10-fold cross-validation using the dictionary of parameters

n_neighbors = np.arange(1, 51, 2)  # odd numbers of neighbors used
weights     = ['uniform','distance']
parameters  = [{'n_neighbors': n_neighbors, 'weights': weights}]

gridCV = GridSearchCV(KNeighborsClassifier(), parameters, cv=10, n_jobs=-1)
gridCV.fit(XTrain, yTrain)

# Print the optimal parameters

bestNeighbors = gridCV.best_params_['n_neighbors']
bestWeight    = gridCV.best_params_['weights']

# One formatted string: prints the same on Python 2 and 3 instead of a tuple on Python 2
print("Best parameters: n_neighbors={} and weight={}".format(bestNeighbors, bestWeight))



In [ ]:

    
# Arrange the grid-search results as a (n_neighbors x weights) matrix of mean scores.
# Every grid_scores_ entry holds the parameter dict at index 0 and the mean score at index 1.
scores = np.zeros((len(n_neighbors), len(weights)))

for score in gridCV.grid_scores_:
    params, mean_score = score[0], score[1]
    ne = params['n_neighbors']
    i = np.argmax(n_neighbors == ne)   # row of this neighbour count
    if params['weights'] == 'uniform':
        j = 0
    else:
        j = 1
    scores[i, j] = mean_score



In [ ]:

    
# Heatmap of the grid-search accuracy: neighbours on x, weighting scheme on y

accuracy_map = Heatmap(
    x = n_neighbors,
    y = weights,
    z = scores.T,
    colorscale = 'Jet',
    reversescale = True,
    colorbar = {
        'title': "Classification Accuracy",
        'len': 5,
        'nticks': 10
    }
)
data = [accuracy_map]

layout = Layout(
    xaxis = {'title': "Number of K nearest neighbors", 'tickvals': n_neighbors},
    yaxis = {'title': "Weights"},
    height = 230,
)

fig = {'data': data, 'layout': layout}

iplot(fig)



In [ ]:

    
# Build the classifier using the optimal parameters detected by grid search

knn = KNeighborsClassifier(n_neighbors = bestNeighbors, weights = bestWeight)
knn.fit(XTrain, yTrain)
yPredKnn = knn.predict(XTest)

print(metrics.classification_report(yTest, yPredKnn))
# One formatted string: prints the same on Python 2 and 3 instead of a tuple on Python 2
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, yPredKnn), 2)))

Randomized search on hyperparameters



In [ ]:

    
# Conduct a randomised search on hyperparameters: 20 draws of n_neighbors from [1, 200)

parameters = {'n_neighbors': randint(1,200)}

# random_state makes the sampled candidates repeatable between runs.
# NOTE(review): older scikit-learn/SciPy combinations sample scipy distributions
# from the global NumPy RNG; seed np.random as well if exact repeatability is
# required there.
randomCV = RandomizedSearchCV(KNeighborsClassifier(),
                              param_distributions=parameters, n_iter=20,
                              random_state=0)
randomCV.fit(XTrain, yTrain)

# Print the optimal n_neighbors detected by randomised search
bestNeighbors = randomCV.best_params_['n_neighbors']
# One formatted string: prints the same on Python 2 and 3 instead of a tuple on Python 2
print("Best parameters: n_neighbors={}".format(bestNeighbors))



In [ ]:

    
# Collect the sampled neighbour counts and their mean scores from the search results
neighbor = []
result   = []
for score_tuple in randomCV.grid_scores_:
    neighbor.append(score_tuple[0]['n_neighbors'])
    result.append(score_tuple[1])



In [ ]:

    
# Scatter plot of the randomised-search results: accuracy against sampled k

search_trace = Scatter(
    x = neighbor,
    y = result,
    mode = "markers"
)
data = [search_trace]

layout = Layout(
    xaxis = {'title': "Number of k nearest neighbors"},
    yaxis = {'title': "Classification Accuracy"},
    height = 500,
    width = 900,
)

fig = {'data': data, 'layout': layout}

iplot(fig)



In [ ]:

    
# Build the classifier using the optimal parameters detected by randomised search

knn = KNeighborsClassifier(n_neighbors=bestNeighbors)
knn.fit(XTrain, yTrain)
yPredKnn = knn.predict(XTest)

print(metrics.classification_report(yTest, yPredKnn))
# One formatted string: prints the same on Python 2 and 3 instead of a tuple on Python 2
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, yPredKnn), 2)))

2) Decision Tree



In [ ]:

    
# Re-split the raw (unscaled) features into training and test sets for the tree models
raw_split = train_test_split(X, y, random_state=1)
XTrain, XTest, yTrain, yTest = raw_split



In [ ]:

    
# Calculate the frequency of classes in yTest as rows of (class, count).
# np.unique(..., return_counts=True) replaces the deprecated scipy.stats.itemfreq.
labels, counts = np.unique(yTest, return_counts=True)
yFreq = np.column_stack((labels, counts))
print(yFreq)



In [ ]:

    
# Fit a depth-3 decision tree on the training split and evaluate it on the test split
dtc = DecisionTreeClassifier(max_depth=3)
dtc.fit(XTrain, yTrain)
predDT = dtc.predict(XTest)

print(metrics.classification_report(yTest, predDT))
# One formatted string: prints the same on Python 2 and 3 instead of a tuple on Python 2
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, predDT), 2)))

# Visualise the decision boundaries for the same depth
visplots.dtDecisionPlot(XTrain, yTrain, XTest, yTest, header, max_depth=3)

3) Random Forests



In [ ]:

    
# Split the raw features into training and test sets
# (same call and seed as the decision-tree section, so the partition is identical)
raw_split = train_test_split(X, y, random_state=1)
XTrain, XTest, yTrain, yTest = raw_split



In [ ]:

    
# Calculate the frequency of classes in yTest as rows of (class, count).
# np.unique(..., return_counts=True) replaces the deprecated scipy.stats.itemfreq.
labels, counts = np.unique(yTest, return_counts=True)
yFreq = np.column_stack((labels, counts))
print(yFreq)



In [ ]:

    
# Build a Random Forest classifier with 100 decision trees (fixed seed, 4 parallel jobs)

rf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=4)
rf.fit(XTrain, yTrain)
predRF = rf.predict(XTest)

print(metrics.classification_report(yTest, predRF))
# One formatted string: prints the same on Python 2 and 3 instead of a tuple on Python 2
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, predRF), 2)))

Visualising the RF accuracy



In [ ]:

    
# Plot the average accuracy of the fitted forest on the test split

visplots.rfAvgAcc(rfModel=rf, XTest=XTest, yTest=yTest)

Feature Importance



In [ ]:

    
# Display the importance of the most important features in a barplot

N_TOP = 10   # number of features to show

importance = rf.feature_importances_
# One name per importance value. `header` also holds the class column as its
# last entry, so it is trimmed to the number of features; previously all
# importances were passed to the plot together with only the first ten names.
names = header[:len(importance)]

# Indices of the N_TOP largest importances, in ascending order of importance
top = np.argsort(importance)[-N_TOP:]

data = [
    Bar(
        x = importance[top],
        y = names[top],
        orientation = 'h',
    )
]

layout = Layout(
    xaxis = dict(title = "Importance of features"),
    yaxis = dict(title = "Features"),
    width = 800,
    margin=Margin(
        l=250,
        r=50,
        b=100,
        t=50,
        pad=4
    ),
)

fig = dict(data = data, layout = layout)

iplot(fig)

Boundary visualisation



In [ ]:

    
# Check the arguments of the function (prints the helper's signature/docstring)
help(visplots.rfDecisionPlot)

# Visualise the boundaries using the current train/test split and column names
visplots.rfDecisionPlot(XTrain, yTrain, XTest, yTest, header)

Tuning Random Forests



In [ ]:

    
# Conduct a grid search with 10-fold cross-validation using the dictionary of parameters

# Parameters you can investigate include:
n_estimators = np.arange(1, 30, 5)
max_depth    = np.arange(1, 100, 5)

# Also, you may choose any of the following
# max_features = [1, 3, 10]
# min_samples_split = [1, 3, 10]
# min_samples_leaf  = [1, 3, 10]
# bootstrap = [True, False]
# criterion = ["gini", "entropy"]

parameters   = [{'n_estimators': n_estimators, 'max_depth': max_depth}]

# The forest is seeded (as in the 100-tree model above) so that differences between
# grid cells come from the parameters and the search is repeatable between runs.
gridCV = GridSearchCV(RandomForestClassifier(random_state=0), param_grid=parameters, cv=10, n_jobs=4)
gridCV.fit(XTrain, yTrain)


# Print the optimal parameters

best_n_estim      = gridCV.best_params_['n_estimators']
best_max_depth    = gridCV.best_params_['max_depth']

# One formatted string: prints the same on Python 2 and 3 instead of a tuple on Python 2
print("Best parameters: n_estimators={}, max_depth={}".format(best_n_estim, best_max_depth))



In [ ]:

    
# Build the classifier using the optimal parameters detected by grid search.
# Seeded like the other forests so the reported metrics are repeatable.

clfRDF = RandomForestClassifier(n_estimators=best_n_estim, max_depth=best_max_depth,
                                random_state=0)
clfRDF.fit(XTrain, yTrain)
predRF = clfRDF.predict(XTest)

print(metrics.classification_report(yTest, predRF))
# One formatted string: prints the same on Python 2 and 3 instead of a tuple on Python 2
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, predRF), 2)))

Visualise the scores of the grid search



In [ ]:

    
# Heatmap of the random-forest grid search, analogous to the KNN one

# Arrange the mean scores as a (n_estimators x max_depth) matrix; every
# grid_scores_ entry holds the parameter dict at index 0 and the mean score at index 1
scores = np.zeros((len(n_estimators), len(max_depth)))

for score in gridCV.grid_scores_:
    params, mean_score = score[0], score[1]
    ne = params['n_estimators']
    md = params['max_depth']
    i = np.argmax(n_estimators == ne)   # row of this estimator count
    j = np.argmax(max_depth == md)      # column of this depth
    scores[i, j] = mean_score

# Estimators on x, depth on y, accuracy as colour
accuracy_map = Heatmap(
    x = n_estimators,
    y = max_depth,
    z = scores.T,
    colorscale = 'Jet',
    reversescale = True,
    colorbar = {
        'title': "Classification Accuracy",
        'nticks': 10
    }
)
data = [accuracy_map]

layout = Layout(
    xaxis = {'title': "Number of estimators"},
    yaxis = {'title': "Max Depth", 'tickvals': max_depth},
    height = 800,
)

fig = {'data': data, 'layout': layout}

iplot(fig)



In [ ]:

    
# Load the submission set and preview its first rows.
# NOTE(review): this file is read from ./processed_data/ whereas the training CSV
# is read from the working directory - confirm the path.
submit_dataset_path = "./processed_data/spam_submit.csv"
submit_dataset = pd.read_csv(submit_dataset_path, sep=",")
submit_dataset.head()



In [ ]:

    
# Drop the first column and move the second column to the end, mirroring the
# reordering applied to the training frame.
# NOTE(review): this is positional, so it presumably relies on the submission file
# having the same column layout as the training file; re-running the cell on the
# reordered frame would drop further columns.
cols = submit_dataset.columns.tolist()
label_col = cols[1]
cols = cols[2:]
cols.append(label_col)
submit_dataset = submit_dataset[cols]
submit_dataset.head()



In [ ]:

    
# Convert the submission frame to an array (this rebinds npArray) and separate
# the feature columns from the trailing label column
npArray = np.array(submit_dataset)
last_col = npArray.shape[1] - 1
submit_X = npArray[:, :last_col].astype(float)
submit_y = npArray[:, last_col]



In [ ]:

    
# Encode the submission labels as integers with a freshly fitted encoder.
# NOTE(review): the encoder is re-fitted on the submission labels; the resulting
# integer codes only match the training codes if both sets contain the same
# label values - confirm before comparing against predictions.
le = preprocessing.LabelEncoder()
le.fit(submit_y)
submit_y = le.transform(submit_y)



In [ ]:

    
# Predict the class of every submission row with the tuned random forest
# (this rebinds predRF, which previously held the test-set predictions)
predRF = clfRDF.predict(submit_X)

# Display the predicted classes
predRF

# Evaluation against the submission labels is left disabled:
# print (metrics.classification_report(submit_y, predRF))
# print ("Overall Accuracy:", round(metrics.accuracy_score(submit_y, predRF),2))

Parallelisation



In [ ]:

    
# Time how long fitting a 500-tree forest takes for n_jobs = 1..8

import timeit

n_jobs_to_try = np.arange(1,9)
elapsed_times = []

for jobs in n_jobs_to_try:
    tic = timeit.default_timer()
    rf = RandomForestClassifier(n_estimators=500, random_state=0, n_jobs=jobs)
    rf.fit(XTrain, yTrain)
    # wall-clock seconds spent constructing and fitting with this many jobs
    elapsed_times.append(timeit.default_timer() - tic)



In [ ]:

    
# Bar chart of the fit time for each n_jobs setting
timing_bars = Bar(
    x = n_jobs_to_try,
    y = elapsed_times
)
data = [timing_bars]

fig = {'data': data, 'layout': Layout(yaxis={'title': 'seconds'})}

iplot(fig)



In [ ]:

    
# The same timings drawn as a line with markers
timing_line = Scatter(
    x = n_jobs_to_try,
    y = elapsed_times,
    mode = 'markers+lines',
)
data = [timing_line]

fig = {'data': data, 'layout': Layout(yaxis={'title': 'seconds'})}

iplot(fig)

4) Support Vector Machines (SVMs)

Linear SVMs



In [ ]:

    
# Load libraries

from sklearn.svm import SVC
import matplotlib.pyplot as plt

import matplotlib_visplots

%matplotlib inline



In [ ]:

    
# Split the raw features into training and test sets (same seed as before).
# NOTE(review): the models in this section are fitted on unscaled features,
# unlike the KNN section - confirm that this is intended.
raw_split = train_test_split(X, y, random_state=1)
XTrain, XTest, yTrain, yTest = raw_split



In [ ]:

    
# Calculate the frequency of classes in yTest as rows of (class, count).
# np.unique(..., return_counts=True) replaces the deprecated scipy.stats.itemfreq.
labels, counts = np.unique(yTest, return_counts=True)
yFreq = np.column_stack((labels, counts))
print(yFreq)



In [ ]:

    
# Fit a linear-kernel SVM (C = 1) and evaluate it on the test split
linearSVM = SVC(kernel='linear', C=1.0)
linearSVM.fit(XTrain, yTrain)
yPredLinear = linearSVM.predict(XTest)

# print() calls with a single formatted argument replace the Python 2 print
# statements: the output is unchanged on Python 2 and the cell also runs on Python 3.
print(metrics.classification_report(yTest, yPredLinear))
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, yPredLinear), 2)))



In [ ]:

    
# Check the arguments of the function
help(matplotlib_visplots.svmDecisionPlot)

# Draw the decision plot for the 'linear' kernel

matplotlib_visplots.svmDecisionPlot(XTrain, yTrain, XTest, yTest, 'linear')

Non-linear (RBF) SVMs



In [ ]:

    
# Fit an RBF-kernel SVM (C = 1) and evaluate it on the test split.
# gamma is left at the library default instead of the deprecated gamma=0.0
# sentinel, which later scikit-learn releases reject. In the releases that still
# provide sklearn.cross_validation the default resolves to the same 1 / n_features.
rbfSVM = SVC(kernel='rbf', C=1.0)
rbfSVM.fit(XTrain, yTrain)
yPredRBF = rbfSVM.predict(XTest)

# print() calls with a single formatted argument replace the Python 2 print
# statements: the output is unchanged on Python 2 and the cell also runs on Python 3.
print(metrics.classification_report(yTest, yPredRBF))
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, yPredRBF), 2)))



In [ ]:

    
# Check the arguments of the function
help(matplotlib_visplots.svmDecisionPlot)

# Draw the decision plot for the 'rbf' kernel
matplotlib_visplots.svmDecisionPlot(XTrain, yTrain, XTest, yTest, 'rbf')

Hyperparameter Tuning for non-linear SVMs



In [ ]:

    
# Define the parameters to be optimised and their values/ranges
# Range for gamma and Cost hyperparameters (powers of two)
g_range = 2. ** np.arange(-15, 5, step=2)
C_range = 2. ** np.arange(-5, 15, step=2)

parameters = [{'gamma': g_range, 'C': C_range}]

# Exhaustive search over the gamma x C grid with 10-fold cross-validation
grid = GridSearchCV(SVC(), parameters, cv= 10)
grid.fit(XTrain, yTrain)

bestG = grid.best_params_['gamma']
bestC = grid.best_params_['C']
# The values printed are base-2 exponents, so the message says so. A print() call
# with one formatted argument replaces the Python 2 print statement and also runs
# on Python 3.
print("The best parameters are: log2(gamma)={} and log2(Cost)={}".format(np.log2(bestG), np.log2(bestC)))



In [ ]:

    
# Heatmap of the SVM grid search: gamma on x, Cost on y, mean accuracy as colour.
# grid_scores_ lists the candidates with C varying slowest and gamma fastest
# (parameter names are iterated in sorted order), hence the (C, gamma) reshape.
scores = [entry[1] for entry in grid.grid_scores_]
scores = np.array(scores).reshape(len(C_range), len(g_range))

plt.figure(figsize=(10, 6))
# origin must be 'upper' or 'lower'. The previous 'higher' is not a valid value;
# older matplotlib treated any non-'upper' value like 'lower', which is kept here.
# The colormap is passed by name because plt.cm.get_cmap is deprecated.
plt.imshow(scores, interpolation='nearest', origin='lower', cmap='jet_r')
plt.xticks(np.arange(len(g_range)), np.log2(g_range))
plt.yticks(np.arange(len(C_range)), np.log2(C_range))
plt.xlabel('gamma (log2)')
plt.ylabel('Cost (log2)')

cbar = plt.colorbar()
cbar.set_label('Classification Accuracy', rotation=270, labelpad=20)
plt.show()

5) Logistic Regression



In [ ]:

    
from sklearn.linear_model import LogisticRegression



In [ ]:

    
# Fit a logistic regression with default settings and evaluate it on the test split
l_regression = LogisticRegression()
l_regression.fit(XTrain, yTrain)
l_prediction = l_regression.predict(XTest)

# print() calls with a single formatted argument replace the Python 2 print
# statements: the output is unchanged on Python 2 and the cell also runs on Python 3.
print(metrics.classification_report(yTest, l_prediction))
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, l_prediction), 2)))



In [ ]:

    
# Check the arguments of the function
help(matplotlib_visplots.logregDecisionPlot)

# Draw the logistic-regression decision plot for the current train/test split
matplotlib_visplots.logregDecisionPlot(XTrain, yTrain, XTest, yTest)

Tuning Logistic Regression



In [ ]:

    
# Define the parameters to be optimised and their values/ranges
# Range for pen and C hyperparameters
pen = ['l1','l2']
C_range = 2. ** np.arange(-5, 15, step=2)

parameters = [{'C': C_range, 'penalty': pen}]

# Exhaustive search over the C x penalty grid with 10-fold cross-validation
grid = GridSearchCV(LogisticRegression(), parameters, cv= 10)
grid.fit(XTrain, yTrain)

bestC = grid.best_params_['C']
bestP = grid.best_params_['penalty']
# A print() call with one formatted argument replaces the Python 2 print
# statement and also runs on Python 3.
print("The best parameters are: cost={} and penalty={}".format(bestC, bestP))



In [ ]:

    
# Heatmap of the logistic-regression grid search: penalty on x, C on y.
# grid_scores_ lists the candidates with C varying slowest and penalty fastest
# (parameter names are iterated in sorted order), so the flat list is already
# laid out as (C, penalty). The previous reshape to (penalty, C) followed by a
# transpose assigned scores to the wrong cells.
scores = [entry[1] for entry in grid.grid_scores_]
scores = np.array(scores).reshape(len(C_range), len(pen))

plt.figure(figsize=(12, 6))
# origin must be 'upper' or 'lower'. The previous 'higher' is not a valid value;
# older matplotlib treated any non-'upper' value like 'lower', which is kept here.
# The colormap is passed by name because plt.cm.get_cmap is deprecated.
plt.imshow(scores, interpolation='nearest', origin='lower', cmap='jet_r')
plt.xticks(np.arange(len(pen)), pen)
plt.yticks(np.arange(len(C_range)), C_range)
plt.xlabel('penalisation norm')
plt.ylabel('inv regularisation strength')

cbar = plt.colorbar()
cbar.set_label('Classification Accuracy', rotation=270, labelpad=20)

plt.show()



In [ ]:

    
# Refit the logistic regression with the best C / penalty found by the grid search
l_regression = LogisticRegression(C=bestC, penalty=bestP)
l_regression.fit(XTrain, yTrain)
l_prediction = l_regression.predict(XTest)

# print() calls with a single formatted argument replace the Python 2 print
# statements: the output is unchanged on Python 2 and the cell also runs on Python 3.
print(metrics.classification_report(yTest, l_prediction))
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, l_prediction), 2)))

6) Neural Networks



In [ ]:

    
from multilayer_perceptron import multilayer_perceptron



In [ ]:

    
# Baseline multilayer perceptron: logistic activation, hidden_layer_sizes=2
# and an initial learning rate of 0.5. Fit on the training split and predict
# labels for the held-out test split.
nnet = multilayer_perceptron.MultilayerPerceptronClassifier(
    activation='logistic',
    hidden_layer_sizes=2,
    learning_rate_init=.5,
)
nnet.fit(XTrain, yTrain)
net_prediction = nnet.predict(XTest)

# print() in call form so the cell also runs on a Python 3 kernel;
# the printed text is the same as before.
print(metrics.classification_report(yTest, net_prediction))
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, net_prediction), 2)))



In [ ]:

    
# Show the plotting helper's documentation to see which arguments it expects
help(matplotlib_visplots.nnDecisionPlot)

# Draw the plot twice, changing only the fifth argument: first the integer 2,
# then the tuple (2, 3, 6). The sixth argument stays at .5.
# NOTE(review): these presumably correspond to hidden_layer_sizes and
# learning_rate_init of the classifier -- confirm against the help() output.
for hidden_layers in (2, (2,3,6)):
    matplotlib_visplots.nnDecisionPlot(XTrain, yTrain, XTest, yTest, hidden_layers, .5)

Tuning Neural Nets



In [ ]:

    
# Define the parameters to be optimised and their values/ranges
# Range for the hidden layer topology and initial learning rate hyperparameters
layer_size_range = [(3,2),(10,10),(2,2,2),10,5] # different networks shapes
learning_rate_range = np.linspace(.1,1,3)       # three evenly spaced rates from 0.1 to 1

parameters = [{'hidden_layer_sizes': layer_size_range, 'learning_rate_init': learning_rate_range}]

# Evaluate every (topology, learning rate) combination with 10-fold
# cross-validation on the training split.
grid = GridSearchCV(multilayer_perceptron.MultilayerPerceptronClassifier(), parameters, cv=10)
grid.fit(XTrain, yTrain)

best_size    = grid.best_params_['hidden_layer_sizes']
best_best_lr = grid.best_params_['learning_rate_init']
# print() in call form so the cell also runs on Python 3; the spacing in the
# message reproduces what the former print statement emitted.
print("The best parameters are: hidden_layer_sizes= {}  and learning_rate_init= {}".format(best_size, best_best_lr))



In [ ]:

    
# grid.grid_scores_ is a flat list with one entry per (topology, learning rate)
# combination; element [1] of each entry is that combination's mean
# cross-validation score.
#
# The grid walks its parameter names in sorted order ('hidden_layer_sizes'
# sorts before 'learning_rate_init') and varies the last name fastest, so the
# list is topology-major: reshape to rows = topology, columns = learning rate,
# then transpose so topologies run along the x axis and learning rates along y.
scores = [x[1] for x in grid.grid_scores_]
scores = np.array(scores).reshape(len(layer_size_range), len(learning_rate_range))
scores = np.transpose(scores)

plt.figure(figsize=(12, 6))
# imshow only accepts origin='upper' or 'lower'; 'lower' places the first
# learning rate at the bottom of the plot.
plt.imshow(scores, interpolation='nearest', origin='lower', cmap=plt.cm.get_cmap('jet_r'))
# The topologies mix tuples and ints, so turn them into text labels explicitly
plt.xticks(np.arange(len(layer_size_range)), [str(size) for size in layer_size_range])
plt.yticks(np.arange(len(learning_rate_range)), learning_rate_range)
plt.xlabel('hidden layer topology')
plt.ylabel('learning rate')

cbar = plt.colorbar()
cbar.set_label('Classification Accuracy', rotation=270, labelpad=20)

plt.show()



In [ ]:

    
# Refit the multilayer perceptron with the topology and learning rate selected
# by the grid search and evaluate it on the held-out test split.
nnet = multilayer_perceptron.MultilayerPerceptronClassifier(
    hidden_layer_sizes=best_size,
    learning_rate_init=best_best_lr,
)
nnet.fit(XTrain, yTrain)
net_prediction = nnet.predict(XTest)

# print() in call form so the cell also runs on a Python 3 kernel;
# the printed text is the same as before.
print(metrics.classification_report(yTest, net_prediction))
print("Overall Accuracy: {}".format(round(metrics.accuracy_score(yTest, net_prediction), 2)))



In [ ]:

	email_id	is_spam	word_freq_will	word_freq_mail	word_freq_internet	...	word_freq_receive	capital_run_length_average	word_freq_address	word_freq_george	char_freq_(
0	3628	no	0.00	0.00	0.0	...	0.00	2.000	0.00	0.00	0.000
1	63	no	0.00	0.49	0.0	...	0.00	2.824	0.00	0.99	0.062
2	1540	no	1.31	0.00	0.0	...	0.00	2.176	0.00	0.00	0.431
3	4460	yes	0.75	0.50	0.5	...	0.25	1.023	0.75	0.00	0.180
4	2771	no	0.00	0.00	0.0	...	0.00	1.500	0.00	1.56	0.180

	word_freq_will	word_freq_original	word_freq_415	word_freq_mail	char_freq_#	char_freq_$	word_freq_internet	word_freq_edu	word_freq_hp	word_freq_lab	...	word_freq_receive	word_freq_000	capital_run_length_average	word_freq_address	word_freq_george	word_freq_cs	word_freq_random	word_freq_conference	word_freq_technology	char_freq_(
count	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000.00000	1000.000000	1000.000000	...	1000.000000	1000.000000	1000.000000	1000.000000	1000.000000	1000	1000	1000.000000	1000.000000	1000.000000
mean	0.537950	0.038370	0.054690	0.189840	0.022792	0.066014	0.073210	0.18100	0.611970	0.118610	...	0.051040	0.081300	4.857610	0.149980	0.775740	0	0	0.036690	0.125580	0.144783
std	0.831747	0.173041	0.365678	0.496022	0.109007	0.248239	0.270431	0.86285	1.734907	0.746169	...	0.192314	0.358906	30.226395	0.955315	3.509211	0	0	0.268434	0.449092	0.232423
min	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.00000	0.000000	0.000000	...	0.000000	0.000000	1.000000	0.000000	0.000000	0	0	0.000000	0.000000	0.000000
25%	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.00000	0.000000	0.000000	...	0.000000	0.000000	1.541000	0.000000	0.000000	0	0	0.000000	0.000000	0.000000
50%	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.000000	0.00000	0.000000	0.000000	...	0.000000	0.000000	2.219500	0.000000	0.000000	0	0	0.000000	0.000000	0.072000
75%	0.820000	0.000000	0.000000	0.000000	0.000000	0.016000	0.000000	0.00000	0.315000	0.000000	...	0.000000	0.000000	3.396500	0.000000	0.000000	0	0	0.000000	0.000000	0.195000
max	6.250000	2.220000	4.760000	5.260000	1.410000	4.017000	3.570000	10.00000	20.830000	14.280000	...	2.000000	5.450000	667.000000	14.280000	33.330000	0	0	5.000000	4.760000	2.941000