Deep Learning Intro


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

Shallow and Deep Networks


In [2]:
from sklearn.datasets import make_moons

X, y = make_moons(n_samples = 1000, noise=0.1, random_state = 0)
plt.plot(X[y == 0, 0], X[y == 0, 1], 'ob', alpha = 0.5)
plt.plot(X[y == 1, 0], X[y == 1, 1], 'xr', alpha = 0.5)
plt.legend(['0', '1'])


Out[2]:
<matplotlib.legend.Legend at 0x7f3208f5ee10>

In [3]:
X.shape


Out[3]:
(1000, 2)

In [4]:
from sklearn.model_selection import train_test_split

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size = 0.3,
                                                    random_state = 42)

In [6]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD, Adam


Using TensorFlow backend.

Shallow Model


In [7]:
model = Sequential()
model.add(Dense(1, 
                input_shape=(2,), 
                activation='sigmoid'))
model.compile(Adam(lr = 0.05), 
              loss = 'binary_crossentropy', 
              metrics = ['accuracy'])

In [8]:
model.fit(X_train, y_train, epochs = 200, verbose=0)


Out[8]:
<keras.callbacks.History at 0x7f31fac1b9b0>

In [9]:
results = model.evaluate(X_test, y_test)



In [10]:
results


Out[10]:
[0.31613724688688916, 0.84333333412806188]

In [11]:
print("The Accuracy score on the Train set is:\t{:0.3f}".format(results[1]))


The Accuracy score on the Train set is:	0.843

In [12]:
def plot_decision_boundary(model, X, y):
    amin, bmin = X.min(axis = 0) - 0.1
    amax, bmax = X.max(axis = 0) + 0.1
    hticks = np.linspace(amin, amax, 101)
    vticks = np.linspace(bmin, bmax, 101)
    
    aa, bb = np.meshgrid(hticks, vticks)
    ab = np.c_[aa.ravel(), bb.ravel()]
    
    c = model.predict(ab)
    cc = c.reshape(aa.shape)

    plt.figure(figsize = (12, 8))
    plt.contourf(aa, bb, cc, cmap = 'bwr', alpha = 0.2)
    plt.plot(X[y == 0, 0], X[y == 0, 1], 'ob', alpha = 0.5)
    plt.plot(X[y == 1, 0], X[y == 1, 1], 'xr', alpha = 0.5)
    plt.legend(['0', '1'])
    
plot_decision_boundary(model, X, y)


Deep Model


In [13]:
model = Sequential()
model.add(Dense(4, 
                input_shape=(2,), 
                activation = 'tanh'))
model.add(Dense(2, 
                activation = 'tanh'))
model.add(Dense(1, 
                activation='sigmoid'))
model.compile(Adam(lr = 0.05), 
              loss = 'binary_crossentropy', 
              metrics=['accuracy'])

In [14]:
model.fit(X_train, y_train, epochs = 100, verbose=0)


Out[14]:
<keras.callbacks.History at 0x7f31f816f390>

In [15]:
model.evaluate(X_test, y_test)


Out[15]:
[0.0023767850548028946, 1.0]

In [16]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [17]:
y_train_pred = model.predict_classes(X_train)
y_test_pred = model.predict_classes(X_test)

print("The Accuracy score on the Train set is:\t{:0.3f}".format(accuracy_score(y_train, y_train_pred)))
print("The Accuracy score on the Test set is:\t{:0.3f}".format(accuracy_score(y_test, y_test_pred)))


The Accuracy score on the Train set is:	0.999
The Accuracy score on the Test set is:	1.000

In [18]:
plot_decision_boundary(model, X, y)


Multiclass classification

The Iris dataset


In [19]:
df = pd.read_csv('./data/iris.csv')

In [20]:
import notebook
import seaborn as sns
sns.pairplot(df, 
             hue = "species")


/home/arcyfelix/.local/lib/python3.5/site-packages/IPython/html.py:14: ShimWarning: The `IPython.html` package has been deprecated. You should import from `notebook` instead. `IPython.html.widgets` has moved to `ipywidgets`.
  "`IPython.html.widgets` has moved to `ipywidgets`.", ShimWarning)
Out[20]:
<seaborn.axisgrid.PairGrid at 0x7f31f802b080>

In [21]:
df.head()


Out[21]:
sepal_length sepal_width petal_length petal_width species
0 5.1 3.5 1.4 0.2 setosa
1 4.9 3.0 1.4 0.2 setosa
2 4.7 3.2 1.3 0.2 setosa
3 4.6 3.1 1.5 0.2 setosa
4 5.0 3.6 1.4 0.2 setosa

In [22]:
X = df.drop('species', axis = 1)
X.head()


Out[22]:
sepal_length sepal_width petal_length petal_width
0 5.1 3.5 1.4 0.2
1 4.9 3.0 1.4 0.2
2 4.7 3.2 1.3 0.2
3 4.6 3.1 1.5 0.2
4 5.0 3.6 1.4 0.2

In [23]:
target_names = df['species'].unique()
target_names


Out[23]:
array(['setosa', 'versicolor', 'virginica'], dtype=object)

In [24]:
target_dict = {n:i for i, n in enumerate(target_names)}
target_dict


Out[24]:
{'setosa': 0, 'versicolor': 1, 'virginica': 2}

In [25]:
y = df['species'].map(target_dict)
y.head()


Out[25]:
0    0
1    0
2    0
3    0
4    0
Name: species, dtype: int64

In [26]:
from keras.utils.np_utils import to_categorical

In [27]:
y_cat = to_categorical(y)

In [28]:
y_cat[:10]


Out[28]:
array([[ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.],
       [ 1.,  0.,  0.]])

In [29]:
X_train, X_test, y_train, y_test = train_test_split(X.values, 
                                                    y_cat,
                                                    test_size = 0.2, 
                                                    random_state = 42)

In [30]:
model = Sequential()
model.add(Dense(3, 
                input_shape = (4,), 
                activation = 'softmax'))
model.compile(Adam(lr = 0.1),
              loss = 'categorical_crossentropy',
              metrics = ['accuracy'])

In [31]:
model.fit(X_train, 
          y_train, 
          epochs = 20, 
          validation_split = 0.1)


Train on 108 samples, validate on 12 samples
Epoch 1/20
108/108 [==============================] - 0s - loss: 0.9767 - acc: 0.7037 - val_loss: 0.7476 - val_acc: 0.5833
Epoch 2/20
108/108 [==============================] - 0s - loss: 0.6648 - acc: 0.6667 - val_loss: 0.8067 - val_acc: 0.5833
Epoch 3/20
108/108 [==============================] - 0s - loss: 0.5620 - acc: 0.7037 - val_loss: 0.7076 - val_acc: 0.5833
Epoch 4/20
108/108 [==============================] - 0s - loss: 0.5162 - acc: 0.6852 - val_loss: 0.5581 - val_acc: 0.5833
Epoch 5/20
108/108 [==============================] - 0s - loss: 0.4175 - acc: 0.7593 - val_loss: 0.5114 - val_acc: 0.9167
Epoch 6/20
108/108 [==============================] - 0s - loss: 0.4116 - acc: 0.8148 - val_loss: 0.4839 - val_acc: 0.9167
Epoch 7/20
108/108 [==============================] - 0s - loss: 0.3703 - acc: 0.8241 - val_loss: 0.4842 - val_acc: 0.5833
Epoch 8/20
108/108 [==============================] - 0s - loss: 0.3628 - acc: 0.8241 - val_loss: 0.4541 - val_acc: 0.9167
Epoch 9/20
108/108 [==============================] - 0s - loss: 0.3408 - acc: 0.9074 - val_loss: 0.4197 - val_acc: 0.9167
Epoch 10/20
108/108 [==============================] - 0s - loss: 0.3305 - acc: 0.8426 - val_loss: 0.4019 - val_acc: 0.9167
Epoch 11/20
108/108 [==============================] - 0s - loss: 0.2979 - acc: 0.9444 - val_loss: 0.4193 - val_acc: 0.7500
Epoch 12/20
108/108 [==============================] - 0s - loss: 0.3074 - acc: 0.9074 - val_loss: 0.3627 - val_acc: 0.9167
Epoch 13/20
108/108 [==============================] - 0s - loss: 0.3059 - acc: 0.8611 - val_loss: 0.3377 - val_acc: 1.0000
Epoch 14/20
108/108 [==============================] - 0s - loss: 0.2809 - acc: 0.9259 - val_loss: 0.3562 - val_acc: 0.9167
Epoch 15/20
108/108 [==============================] - 0s - loss: 0.2813 - acc: 0.9444 - val_loss: 0.3217 - val_acc: 0.9167
Epoch 16/20
108/108 [==============================] - 0s - loss: 0.2535 - acc: 0.9352 - val_loss: 0.3602 - val_acc: 0.8333
Epoch 17/20
108/108 [==============================] - 0s - loss: 0.2461 - acc: 0.9352 - val_loss: 0.2898 - val_acc: 0.9167
Epoch 18/20
108/108 [==============================] - 0s - loss: 0.2485 - acc: 0.9444 - val_loss: 0.2692 - val_acc: 1.0000
Epoch 19/20
108/108 [==============================] - 0s - loss: 0.2422 - acc: 0.9259 - val_loss: 0.2803 - val_acc: 1.0000
Epoch 20/20
108/108 [==============================] - 0s - loss: 0.2169 - acc: 0.9630 - val_loss: 0.2601 - val_acc: 0.9167
Out[31]:
<keras.callbacks.History at 0x7f31cf40c9e8>

In [32]:
y_pred = model.predict(X_test)

In [33]:
y_pred[:5]


Out[33]:
array([[  3.25250672e-03,   6.03683591e-01,   3.93063873e-01],
       [  9.93527591e-01,   6.46749511e-03,   4.85106739e-06],
       [  1.03202616e-07,   1.51174664e-02,   9.84882414e-01],
       [  3.78669961e-03,   5.20649433e-01,   4.75563884e-01],
       [  2.11044005e-03,   5.44582248e-01,   4.53307271e-01]], dtype=float32)

In [34]:
y_test_class = np.argmax(y_test, axis = 1)
y_pred_class = np.argmax(y_pred, axis = 1)

In [35]:
from sklearn.metrics import classification_report

In [36]:
print(classification_report(y_test_class, y_pred_class))


             precision    recall  f1-score   support

          0       1.00      1.00      1.00        10
          1       1.00      0.89      0.94         9
          2       0.92      1.00      0.96        11

avg / total       0.97      0.97      0.97        30


In [37]:
confusion_matrix(y_test_class, y_pred_class)


Out[37]:
array([[10,  0,  0],
       [ 0,  8,  1],
       [ 0,  0, 11]])

Exercise 1

The Pima Indians dataset is a famous dataset distributed by UCI and originally collected from the National Institute of Diabetes and Digestive and Kidney Diseases. It contains data from clinical exams of women aged 21 and above of Pima Indian origin. The objective is to predict, based on diagnostic measurements, whether a patient has diabetes.

It has the following features:

  • Pregnancies: Number of times pregnant
  • Glucose: Plasma glucose concentration at 2 hours in an oral glucose tolerance test
  • BloodPressure: Diastolic blood pressure (mm Hg)
  • SkinThickness: Triceps skin fold thickness (mm)
  • Insulin: 2-Hour serum insulin (mu U/ml)
  • BMI: Body mass index (weight in kg/(height in m)^2)
  • DiabetesPedigreeFunction: Diabetes pedigree function
  • Age: Age (years)

The last column, Outcome, is a binary variable and is the target.

In this first exercise we will explore it through the following steps:

  1. Load the ./data/diabetes.csv dataset and use pandas to explore the range of each feature.
  2. For each feature draw a histogram. Bonus points if you draw all the histograms in the same figure.
  3. Explore correlations of the features with the outcome column. You can do this in several ways, for example using the sns.pairplot we used above or drawing a heatmap of the correlations (a minimal heatmap sketch follows this list).
  4. Do the features need standardization? If so, which standardization technique will you use? MinMax? Standard?
  5. Prepare your final X and y variables to be used by an ML model. Make sure you define your target variable well. Will you need dummy columns?
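
A minimal sketch of the correlation heatmap mentioned in step 3, assuming the same ./data/diabetes.csv file and that seaborn is available (not executed as part of the notebook output below):

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Load the dataset and plot a heatmap of pairwise feature correlations,
# including the correlation of each feature with Outcome.
ex1 = pd.read_csv('./data/diabetes.csv')
plt.figure(figsize=(10, 8))
sns.heatmap(ex1.corr(), annot=True, fmt='.2f', cmap='coolwarm')
plt.title('Feature correlations (including Outcome)')
plt.show()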

In [38]:
ex1 = pd.read_csv('./data/diabetes.csv')

In [39]:
ex1.hist(figsize = (12, 7))
plt.tight_layout()



In [40]:
import seaborn as sns

In [41]:
sns.pairplot(ex1, 
             hue = "Outcome")


Out[41]:
<seaborn.axisgrid.PairGrid at 0x7f31cfe0cfd0>

In [42]:
ex1.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 768 entries, 0 to 767
Data columns (total 9 columns):
Pregnancies                 768 non-null int64
Glucose                     768 non-null int64
BloodPressure               768 non-null int64
SkinThickness               768 non-null int64
Insulin                     768 non-null int64
BMI                         768 non-null float64
DiabetesPedigreeFunction    768 non-null float64
Age                         768 non-null int64
Outcome                     768 non-null int64
dtypes: float64(2), int64(7)
memory usage: 60.0 KB

In [43]:
ex1.describe()


Out[43]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
count 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000 768.000000
mean 3.845052 120.894531 69.105469 20.536458 79.799479 31.992578 0.471876 33.240885 0.348958
std 3.369578 31.972618 19.355807 15.952218 115.244002 7.884160 0.331329 11.760232 0.476951
min 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.078000 21.000000 0.000000
25% 1.000000 99.000000 62.000000 0.000000 0.000000 27.300000 0.243750 24.000000 0.000000
50% 3.000000 117.000000 72.000000 23.000000 30.500000 32.000000 0.372500 29.000000 0.000000
75% 6.000000 140.250000 80.000000 32.000000 127.250000 36.600000 0.626250 41.000000 1.000000
max 17.000000 199.000000 122.000000 99.000000 846.000000 67.100000 2.420000 81.000000 1.000000

In [44]:
from sklearn.preprocessing import MinMaxScaler

In [45]:
minmax = MinMaxScaler()

In [46]:
X = ex1.drop('Outcome', axis = 1)

In [47]:
Y = ex1['Outcome']

In [48]:
X.head()


Out[48]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age
0 6 148 72 35 0 33.6 0.627 50
1 1 85 66 29 0 26.6 0.351 31
2 8 183 64 0 0 23.3 0.672 32
3 1 89 66 23 94 28.1 0.167 21
4 0 137 40 35 168 43.1 2.288 33

In [49]:
X = pd.DataFrame(minmax.fit_transform(X), 
                 columns = ['Pregnancies', 'Glucose', 'BloodPressure', 
                            'SkinThickness','Insulin', 'BMI', 
                            'DiabetesPedigreeFunction', 'Age'])

In [50]:
X.head()


Out[50]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age
0 0.352941 0.743719 0.590164 0.353535 0.000000 0.500745 0.234415 0.483333
1 0.058824 0.427136 0.540984 0.292929 0.000000 0.396423 0.116567 0.166667
2 0.470588 0.919598 0.524590 0.000000 0.000000 0.347243 0.253629 0.183333
3 0.058824 0.447236 0.540984 0.232323 0.111111 0.418778 0.038002 0.000000
4 0.000000 0.688442 0.327869 0.353535 0.198582 0.642325 0.943638 0.200000

In [51]:
Y.head()


Out[51]:
0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [52]:
X = X.values
Y = Y.values

In [53]:
from keras.utils import to_categorical

In [54]:
Y_cat = to_categorical(Y)

In [55]:
Y_cat


Out[55]:
array([[ 0.,  1.],
       [ 1.,  0.],
       [ 0.,  1.],
       ..., 
       [ 1.,  0.],
       [ 0.,  1.],
       [ 1.,  0.]])

Exercise 2

Build a fully connected NN model that predicts diabetes. Follow these steps:

  1. Split your data into train/test sets with a test size of 20% and a random_state = 22.
  2. Define a sequential model with at least one inner layer. You will have to make choices for the following things:
    • what is the size of the input?
    • how many nodes will you use in each layer?
    • what is the size of the output?
    • what activation functions will you use in the inner layers?
    • what activation function will you use at the output?
    • what loss function will you use?
    • what optimizer will you use?
  3. Fit your model on the training set, using a validation_split of 0.1.
  4. Test your trained model on the test data from the train/test split.
  5. Check the accuracy score, the confusion matrix and the classification report.

In [56]:
from sklearn.model_selection import train_test_split

In [57]:
X_train, X_test, Y_train, Y_test = train_test_split(X, 
                                                    Y_cat, 
                                                    test_size = 0.2, 
                                                    random_state = 22)

In [58]:
X_train.shape


Out[58]:
(614, 8)

In [59]:
X_test.shape


Out[59]:
(154, 8)

In [60]:
Y_train.shape


Out[60]:
(614, 2)

In [61]:
Y_test.shape


Out[61]:
(154, 2)

In [62]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam

In [63]:
model = Sequential()
model.add(Dense(units = 64, 
                input_shape = (8, ), 
                activation = 'tanh'))
model.add(Dense(units = 32, 
                activation = 'tanh'))
model.add(Dense(units = 2, 
                activation = 'softmax'))

model.compile(optimizer = Adam(lr = 0.01), 
              loss = 'categorical_crossentropy', 
              metrics = ['accuracy'])

In [64]:
model.fit(X_train, 
          Y_train, 
          validation_split = 0.1, 
          epochs = 50, 
          verbose = 2)


Train on 552 samples, validate on 62 samples
Epoch 1/50
0s - loss: 0.6225 - acc: 0.6649 - val_loss: 0.5827 - val_acc: 0.7258
Epoch 2/50
0s - loss: 0.5543 - acc: 0.7210 - val_loss: 0.5359 - val_acc: 0.7097
Epoch 3/50
0s - loss: 0.5223 - acc: 0.7518 - val_loss: 0.6674 - val_acc: 0.6613
Epoch 4/50
0s - loss: 0.5390 - acc: 0.7283 - val_loss: 0.5103 - val_acc: 0.7097
Epoch 5/50
0s - loss: 0.4865 - acc: 0.7591 - val_loss: 0.4766 - val_acc: 0.7419
Epoch 6/50
0s - loss: 0.5093 - acc: 0.7591 - val_loss: 0.5236 - val_acc: 0.7581
Epoch 7/50
0s - loss: 0.4987 - acc: 0.7609 - val_loss: 0.4880 - val_acc: 0.7742
Epoch 8/50
0s - loss: 0.5215 - acc: 0.7301 - val_loss: 0.5358 - val_acc: 0.7419
Epoch 9/50
0s - loss: 0.4929 - acc: 0.7572 - val_loss: 0.4982 - val_acc: 0.7258
Epoch 10/50
0s - loss: 0.4764 - acc: 0.7826 - val_loss: 0.4947 - val_acc: 0.7581
Epoch 11/50
0s - loss: 0.4719 - acc: 0.7736 - val_loss: 0.5174 - val_acc: 0.7742
Epoch 12/50
0s - loss: 0.4766 - acc: 0.7808 - val_loss: 0.5069 - val_acc: 0.7258
Epoch 13/50
0s - loss: 0.4680 - acc: 0.7862 - val_loss: 0.5308 - val_acc: 0.7581
Epoch 14/50
0s - loss: 0.4745 - acc: 0.7736 - val_loss: 0.5162 - val_acc: 0.7258
Epoch 15/50
0s - loss: 0.4829 - acc: 0.7699 - val_loss: 0.5050 - val_acc: 0.7742
Epoch 16/50
0s - loss: 0.4682 - acc: 0.7717 - val_loss: 0.5016 - val_acc: 0.7742
Epoch 17/50
0s - loss: 0.4807 - acc: 0.7880 - val_loss: 0.5027 - val_acc: 0.7742
Epoch 18/50
0s - loss: 0.4691 - acc: 0.7808 - val_loss: 0.5285 - val_acc: 0.7742
Epoch 19/50
0s - loss: 0.4737 - acc: 0.7790 - val_loss: 0.4951 - val_acc: 0.7742
Epoch 20/50
0s - loss: 0.4849 - acc: 0.7681 - val_loss: 0.5102 - val_acc: 0.7097
Epoch 21/50
0s - loss: 0.4719 - acc: 0.7862 - val_loss: 0.4894 - val_acc: 0.7581
Epoch 22/50
0s - loss: 0.4773 - acc: 0.7681 - val_loss: 0.5346 - val_acc: 0.7742
Epoch 23/50
0s - loss: 0.4729 - acc: 0.7790 - val_loss: 0.4971 - val_acc: 0.7903
Epoch 24/50
0s - loss: 0.4611 - acc: 0.7808 - val_loss: 0.5265 - val_acc: 0.7903
Epoch 25/50
0s - loss: 0.4626 - acc: 0.7736 - val_loss: 0.4816 - val_acc: 0.7742
Epoch 26/50
0s - loss: 0.4482 - acc: 0.7989 - val_loss: 0.4850 - val_acc: 0.8065
Epoch 27/50
0s - loss: 0.4579 - acc: 0.7880 - val_loss: 0.5152 - val_acc: 0.7903
Epoch 28/50
0s - loss: 0.4540 - acc: 0.7862 - val_loss: 0.4754 - val_acc: 0.7903
Epoch 29/50
0s - loss: 0.4549 - acc: 0.7754 - val_loss: 0.4711 - val_acc: 0.8065
Epoch 30/50
0s - loss: 0.4650 - acc: 0.7754 - val_loss: 0.5301 - val_acc: 0.7258
Epoch 31/50
0s - loss: 0.4635 - acc: 0.7482 - val_loss: 0.4639 - val_acc: 0.7903
Epoch 32/50
0s - loss: 0.4514 - acc: 0.7808 - val_loss: 0.4670 - val_acc: 0.8065
Epoch 33/50
0s - loss: 0.4591 - acc: 0.7844 - val_loss: 0.5068 - val_acc: 0.7903
Epoch 34/50
0s - loss: 0.4689 - acc: 0.7754 - val_loss: 0.5378 - val_acc: 0.7742
Epoch 35/50
0s - loss: 0.4463 - acc: 0.7808 - val_loss: 0.4725 - val_acc: 0.8226
Epoch 36/50
0s - loss: 0.4427 - acc: 0.7826 - val_loss: 0.5114 - val_acc: 0.7742
Epoch 37/50
0s - loss: 0.4643 - acc: 0.7844 - val_loss: 0.4724 - val_acc: 0.7742
Epoch 38/50
0s - loss: 0.4523 - acc: 0.7989 - val_loss: 0.4736 - val_acc: 0.7742
Epoch 39/50
0s - loss: 0.4713 - acc: 0.7663 - val_loss: 0.4661 - val_acc: 0.7903
Epoch 40/50
0s - loss: 0.4615 - acc: 0.7772 - val_loss: 0.4825 - val_acc: 0.7903
Epoch 41/50
0s - loss: 0.4417 - acc: 0.7826 - val_loss: 0.4403 - val_acc: 0.8065
Epoch 42/50
0s - loss: 0.4570 - acc: 0.7862 - val_loss: 0.5056 - val_acc: 0.7742
Epoch 43/50
0s - loss: 0.4621 - acc: 0.7808 - val_loss: 0.5158 - val_acc: 0.7903
Epoch 44/50
0s - loss: 0.4536 - acc: 0.7736 - val_loss: 0.4661 - val_acc: 0.7903
Epoch 45/50
0s - loss: 0.4449 - acc: 0.7826 - val_loss: 0.4723 - val_acc: 0.7903
Epoch 46/50
0s - loss: 0.4383 - acc: 0.7971 - val_loss: 0.5140 - val_acc: 0.7903
Epoch 47/50
0s - loss: 0.4515 - acc: 0.7736 - val_loss: 0.4844 - val_acc: 0.7742
Epoch 48/50
0s - loss: 0.4369 - acc: 0.7917 - val_loss: 0.4531 - val_acc: 0.7903
Epoch 49/50
0s - loss: 0.4436 - acc: 0.7899 - val_loss: 0.4761 - val_acc: 0.7742
Epoch 50/50
0s - loss: 0.4470 - acc: 0.7917 - val_loss: 0.5370 - val_acc: 0.7581
Out[64]:
<keras.callbacks.History at 0x7f31cd8f90f0>

In [65]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [66]:
Y_pred = model.predict(X_test)

In [67]:
Y_pred


Out[67]:
array([[ 0.69321513,  0.30678487],
       [ 0.96971548,  0.03028453],
       [ 0.60026515,  0.39973488],
       [ 0.71022725,  0.28977269],
       [ 0.95935559,  0.0406444 ],
       [ 0.83953601,  0.16046396],
       [ 0.91710025,  0.08289977],
       [ 0.90058094,  0.0994191 ],
       [ 0.8986693 ,  0.1013307 ],
       [ 0.98115957,  0.01884041],
       [ 0.93336481,  0.06663517],
       [ 0.77059811,  0.22940189],
       [ 0.96629304,  0.03370703],
       [ 0.36887372,  0.63112622],
       [ 0.95320195,  0.04679808],
       [ 0.82475483,  0.17524509],
       [ 0.97457594,  0.02542408],
       [ 0.73955768,  0.26044238],
       [ 0.64015061,  0.35984933],
       [ 0.84146076,  0.15853928],
       [ 0.90808642,  0.0919136 ],
       [ 0.89343238,  0.10656761],
       [ 0.48478565,  0.51521438],
       [ 0.27667168,  0.72332835],
       [ 0.95334917,  0.04665087],
       [ 0.60332954,  0.39667046],
       [ 0.9758839 ,  0.02411616],
       [ 0.94482136,  0.05517857],
       [ 0.90599465,  0.09400539],
       [ 0.22653127,  0.77346873],
       [ 0.93729925,  0.06270076],
       [ 0.60237187,  0.3976281 ],
       [ 0.89825439,  0.10174567],
       [ 0.78253925,  0.21746069],
       [ 0.93499118,  0.06500882],
       [ 0.96254694,  0.03745313],
       [ 0.60758901,  0.39241108],
       [ 0.90729594,  0.09270401],
       [ 0.93134058,  0.06865945],
       [ 0.96697545,  0.03302453],
       [ 0.21976261,  0.78023738],
       [ 0.97240168,  0.02759836],
       [ 0.88825488,  0.1117451 ],
       [ 0.97759038,  0.02240961],
       [ 0.86293334,  0.1370666 ],
       [ 0.18474555,  0.81525445],
       [ 0.96723747,  0.0327625 ],
       [ 0.81634802,  0.18365197],
       [ 0.43692359,  0.56307638],
       [ 0.96131909,  0.03868087],
       [ 0.84154046,  0.15845959],
       [ 0.96365988,  0.03634013],
       [ 0.9770537 ,  0.02294623],
       [ 0.98020089,  0.01979909],
       [ 0.88991535,  0.11008466],
       [ 0.95201439,  0.04798559],
       [ 0.93242252,  0.06757755],
       [ 0.15368278,  0.84631717],
       [ 0.95869654,  0.04130351],
       [ 0.97709912,  0.02290088],
       [ 0.9830032 ,  0.01699686],
       [ 0.96973217,  0.03026784],
       [ 0.65122116,  0.34877881],
       [ 0.95416433,  0.04583569],
       [ 0.794447  ,  0.20555303],
       [ 0.95168215,  0.04831785],
       [ 0.88129544,  0.11870459],
       [ 0.56398833,  0.4360117 ],
       [ 0.5529989 ,  0.44700116],
       [ 0.47083718,  0.52916282],
       [ 0.94784302,  0.05215693],
       [ 0.78524989,  0.21475014],
       [ 0.92858267,  0.07141727],
       [ 0.93859285,  0.06140712],
       [ 0.8108446 ,  0.18915544],
       [ 0.89687967,  0.10312031],
       [ 0.78333378,  0.21666621],
       [ 0.64949042,  0.35050952],
       [ 0.90972304,  0.090277  ],
       [ 0.98363471,  0.01636528],
       [ 0.55289859,  0.44710144],
       [ 0.76449317,  0.2355068 ],
       [ 0.8865757 ,  0.11342428],
       [ 0.95973766,  0.04026231],
       [ 0.72164947,  0.2783505 ],
       [ 0.96750689,  0.03249311],
       [ 0.96346176,  0.03653826],
       [ 0.98855644,  0.01144364],
       [ 0.9708311 ,  0.02916888],
       [ 0.97827631,  0.02172367],
       [ 0.23434974,  0.76565027],
       [ 0.85446703,  0.14553298],
       [ 0.96668935,  0.03331069],
       [ 0.96936911,  0.03063091],
       [ 0.7542367 ,  0.24576329],
       [ 0.98133391,  0.01866605],
       [ 0.9433561 ,  0.05664389],
       [ 0.90725279,  0.09274718],
       [ 0.95689797,  0.04310205],
       [ 0.81465399,  0.18534602],
       [ 0.95281434,  0.04718561],
       [ 0.47119179,  0.52880824],
       [ 0.8933261 ,  0.10667392],
       [ 0.89726228,  0.10273775],
       [ 0.69243574,  0.30756423],
       [ 0.76262182,  0.23737819],
       [ 0.36196771,  0.63803232],
       [ 0.71612674,  0.28387323],
       [ 0.92267972,  0.07732033],
       [ 0.94496292,  0.05503708],
       [ 0.95859027,  0.04140974],
       [ 0.51408803,  0.48591197],
       [ 0.76307815,  0.23692189],
       [ 0.97804898,  0.02195099],
       [ 0.935408  ,  0.06459204],
       [ 0.78453857,  0.21546148],
       [ 0.86113137,  0.13886867],
       [ 0.72091603,  0.27908391],
       [ 0.76663572,  0.23336425],
       [ 0.80692685,  0.19307315],
       [ 0.85012031,  0.14987969],
       [ 0.811297  ,  0.18870302],
       [ 0.97452706,  0.0254729 ],
       [ 0.96878034,  0.03121963],
       [ 0.65581119,  0.34418878],
       [ 0.89686471,  0.10313524],
       [ 0.75794047,  0.2420595 ],
       [ 0.98243481,  0.01756524],
       [ 0.95737249,  0.04262752],
       [ 0.91538006,  0.08461995],
       [ 0.41446161,  0.58553839],
       [ 0.76993906,  0.23006095],
       [ 0.97207671,  0.02792323],
       [ 0.98582274,  0.01417728],
       [ 0.89332086,  0.10667909],
       [ 0.95915025,  0.0408498 ],
       [ 0.96655065,  0.0334494 ],
       [ 0.91852868,  0.08147135],
       [ 0.64509565,  0.35490438],
       [ 0.89742082,  0.10257912],
       [ 0.75184715,  0.24815293],
       [ 0.94167197,  0.05832803],
       [ 0.80161494,  0.19838502],
       [ 0.82589859,  0.17410146],
       [ 0.70045322,  0.29954681],
       [ 0.98749638,  0.01250357],
       [ 0.63433129,  0.36566874],
       [ 0.80540121,  0.19459884],
       [ 0.10937587,  0.89062417],
       [ 0.75345683,  0.24654317],
       [ 0.98274213,  0.01725789],
       [ 0.69915473,  0.30084527],
       [ 0.76360607,  0.23639396],
       [ 0.565597  ,  0.434403  ]], dtype=float32)

In [68]:
Y_pred = np.argmax(Y_pred, axis = 1)
Y_test = np.argmax(Y_test, axis = 1)

In [69]:
pd.Series(Y_test).value_counts() / len(Y_test)


Out[69]:
0    0.649351
1    0.350649
dtype: float64

In [70]:
pd.Series(Y_pred).value_counts() / len(Y_test)


Out[70]:
0    0.909091
1    0.090909
dtype: float64

In [71]:
accuracy_score(Y_test, Y_pred)


Out[71]:
0.70129870129870131

In [72]:
print(classification_report(Y_test, Y_pred))


             precision    recall  f1-score   support

          0       0.69      0.97      0.81       100
          1       0.79      0.20      0.32        54

avg / total       0.73      0.70      0.64       154


In [73]:
confusion_matrix(Y_test, Y_pred)


Out[73]:
array([[97,  3],
       [43, 11]])

Exercise 3

Compare your work with the results presented in this notebook. Are your Neural Network results better or worse than the results obtained by traditional Machine Learning techniques?

  • Try training a Support Vector Machine or a Random Forest model on the exact same train/test split. Is the performance better or worse?
  • Try restricting your features to only 4 features like in the suggested notebook. How does model performance change?

Training a Support Vector Machine Classifier and a Random Forest Classifier


In [74]:
X_train, X_test, Y_train, Y_test = train_test_split(X, 
                                                    Y, 
                                                    test_size = 0.2, 
                                                    random_state = 22)

In [75]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [76]:
models = [SVC(), RandomForestClassifier()]
for model in models:
    model.fit(X_train, Y_train)
    print('*' * 50)
    print(model)
    Y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)
    print('Accuracy: {:0.3f}'.format(accuracy))
    print('Confusion matrix: \n', confusion_matrix(Y_test, Y_pred))


**************************************************
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Accuracy: 0.734
Confusion matrix: 
 [[96  4]
 [37 17]]
**************************************************
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Accuracy: 0.695
Confusion matrix: 
 [[91  9]
 [38 16]]

Restricting to only the 4 most important features


In [77]:
ex1.head()


Out[77]:
Pregnancies Glucose BloodPressure SkinThickness Insulin BMI DiabetesPedigreeFunction Age Outcome
0 6 148 72 35 0 33.6 0.627 50 1
1 1 85 66 29 0 26.6 0.351 31 0
2 8 183 64 0 0 23.3 0.672 32 1
3 1 89 66 23 94 28.1 0.167 21 0
4 0 137 40 35 168 43.1 2.288 33 1

In [78]:
outcome_corr = ex1.corr()['Outcome']
outcome_corr


Out[78]:
Pregnancies                 0.221898
Glucose                     0.466581
BloodPressure               0.065068
SkinThickness               0.074752
Insulin                     0.130548
BMI                         0.292695
DiabetesPedigreeFunction    0.173844
Age                         0.238356
Outcome                     1.000000
Name: Outcome, dtype: float64

In [79]:
outcome_corr_sorted = outcome_corr.sort_values(ascending = False)
outcome_corr_sorted


Out[79]:
Outcome                     1.000000
Glucose                     0.466581
BMI                         0.292695
Age                         0.238356
Pregnancies                 0.221898
DiabetesPedigreeFunction    0.173844
Insulin                     0.130548
SkinThickness               0.074752
BloodPressure               0.065068
Name: Outcome, dtype: float64

Thus, the 4 features most correlated with Outcome are:

  • Glucose
  • BMI
  • Age
  • Pregnancies

In [80]:
outcome_corr_sorted.index[0]


Out[80]:
'Outcome'

In [81]:
chosen_features = outcome_corr_sorted.index[1:5]
chosen_features


Out[81]:
Index(['Glucose', 'BMI', 'Age', 'Pregnancies'], dtype='object')

In [82]:
X = ex1[chosen_features]
X = minmax.fit_transform(X)

In [83]:
pd.DataFrame(X, columns = chosen_features).head()


Out[83]:
Glucose BMI Age Pregnancies
0 0.743719 0.500745 0.483333 0.352941
1 0.427136 0.396423 0.166667 0.058824
2 0.919598 0.347243 0.183333 0.470588
3 0.447236 0.418778 0.000000 0.058824
4 0.688442 0.642325 0.200000 0.000000

In [84]:
Y = ex1['Outcome']

In [85]:
Y.head()


Out[85]:
0    1
1    0
2    1
3    0
4    1
Name: Outcome, dtype: int64

In [86]:
X_train, X_test, Y_train, Y_test = train_test_split(X, 
                                                    Y, 
                                                    test_size = 0.2, 
                                                    random_state = 22)

In [87]:
models = [SVC(), RandomForestClassifier()]
for model in models:
    model.fit(X_train, Y_train)
    print('*' * 100)
    print(model)
    Y_pred = model.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)
    print('Accuracy: {:0.3f}'.format(accuracy))
    print('Confusion matrix: \n', confusion_matrix(Y_test, Y_pred))


****************************************************************************************************
SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)
Accuracy: 0.747
Confusion matrix: 
 [[92  8]
 [31 23]]
****************************************************************************************************
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
            verbose=0, warm_start=False)
Accuracy: 0.727
Confusion matrix: 
 [[90 10]
 [32 22]]

With all eight features, the SVM (accuracy 0.734) performs slightly better than the neural network (0.701), while the Random Forest (0.695) is slightly worse; restricting to the four most correlated features improves both classical models further (SVM 0.747, Random Forest 0.727).
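
To answer the second bullet of Exercise 3 for the neural network as well, here is a minimal, untested sketch that re-trains the same architecture as Exercise 2 on the four restricted features. It assumes X and Y still hold the MinMax-scaled Glucose/BMI/Age/Pregnancies matrix and the Outcome labels from the cells above; the resulting accuracy is not reported here.

from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# Same train/test split as the classical models above, with one-hot targets.
X_train, X_test, Y_train, Y_test = train_test_split(X, to_categorical(Y),
                                                    test_size=0.2,
                                                    random_state=22)

# Same architecture as Exercise 2; only the input size changes to 4 features.
model = Sequential()
model.add(Dense(units=64, input_shape=(4,), activation='tanh'))
model.add(Dense(units=32, activation='tanh'))
model.add(Dense(units=2, activation='softmax'))
model.compile(optimizer=Adam(lr=0.01),
              loss='categorical_crossentropy',
              metrics=['accuracy'])

model.fit(X_train, Y_train, validation_split=0.1, epochs=50, verbose=0)
accuracy = model.evaluate(X_test, Y_test, verbose=0)[1]
print('Test accuracy with 4 features: {:0.3f}'.format(accuracy))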