Linear Regression


In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('./data/weight-height.csv')

In [3]:
df.head()


Out[3]:
  Gender     Height      Weight
0   Male  73.847017  241.893563
1   Male  68.781904  162.310473
2   Male  74.110105  212.740856
3   Male  71.730978  220.042470
4   Male  69.881796  206.349801

In [4]:
df.plot(kind = 'scatter',
        figsize = (7, 7),
        x = 'Height',
        y = 'Weight',
        title = 'Weight and Height in adults')


Out[4]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fdf4d773780>

In [5]:
df.plot(kind = 'scatter',
        figsize = (7, 7),
        x = 'Height',
        y = 'Weight',
        title = 'Weight and Height in adults')

# Here we're plotting the red line 'by hand' with fixed values
# We'll try to learn this line with an algorithm below
plt.plot([55, 78], [75, 250], color='red', linewidth=3)


Out[5]:
[<matplotlib.lines.Line2D at 0x7fdf4d4c4e80>]

In [6]:
def line(x, w=0, b=0):
    return x * w + b

In [7]:
x = np.linspace(55, 80, 100)

In [8]:
x


Out[8]:
array([ 55.        ,  55.25252525,  55.50505051,  55.75757576,
        56.01010101,  56.26262626,  56.51515152,  56.76767677,
        57.02020202,  57.27272727,  57.52525253,  57.77777778,
        58.03030303,  58.28282828,  58.53535354,  58.78787879,
        59.04040404,  59.29292929,  59.54545455,  59.7979798 ,
        60.05050505,  60.3030303 ,  60.55555556,  60.80808081,
        61.06060606,  61.31313131,  61.56565657,  61.81818182,
        62.07070707,  62.32323232,  62.57575758,  62.82828283,
        63.08080808,  63.33333333,  63.58585859,  63.83838384,
        64.09090909,  64.34343434,  64.5959596 ,  64.84848485,
        65.1010101 ,  65.35353535,  65.60606061,  65.85858586,
        66.11111111,  66.36363636,  66.61616162,  66.86868687,
        67.12121212,  67.37373737,  67.62626263,  67.87878788,
        68.13131313,  68.38383838,  68.63636364,  68.88888889,
        69.14141414,  69.39393939,  69.64646465,  69.8989899 ,
        70.15151515,  70.4040404 ,  70.65656566,  70.90909091,
        71.16161616,  71.41414141,  71.66666667,  71.91919192,
        72.17171717,  72.42424242,  72.67676768,  72.92929293,
        73.18181818,  73.43434343,  73.68686869,  73.93939394,
        74.19191919,  74.44444444,  74.6969697 ,  74.94949495,
        75.2020202 ,  75.45454545,  75.70707071,  75.95959596,
        76.21212121,  76.46464646,  76.71717172,  76.96969697,
        77.22222222,  77.47474747,  77.72727273,  77.97979798,
        78.23232323,  78.48484848,  78.73737374,  78.98989899,
        79.24242424,  79.49494949,  79.74747475,  80.        ])

In [9]:
yhat = line(x, w = 0, b = 0)

In [10]:
yhat


Out[10]:
array([ 0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
        0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.])

In [11]:
df.plot(kind = 'scatter',
        figsize = (7, 7),
        x = 'Height',
        y = 'Weight',
        title = 'Weight and Height in adults')
plt.plot(x, yhat, color='red', linewidth=3)


Out[11]:
[<matplotlib.lines.Line2D at 0x7fdf49b53320>]

Cost Function
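
The cost we will minimize is the mean squared error, i.e. the average squared difference between true and predicted values: MSE = (1/N) * Σ (y_true - y_pred)². The cell below implements it directly with numpy.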


In [12]:
def mean_squared_error(y_true, y_pred):
    s = (y_true - y_pred) ** 2
    return s.mean()

In [13]:
X = df[['Height']].values
y_true = df['Weight'].values

In [14]:
y_true


Out[14]:
array([ 241.89356318,  162.31047252,  212.74085556, ...,  128.47531878,
        163.85246135,  113.64910268])

In [15]:
y_pred = line(X)

In [16]:
y_pred


Out[16]:
array([[ 0.],
       [ 0.],
       [ 0.],
       ..., 
       [ 0.],
       [ 0.],
       [ 0.]])

In [17]:
mean_squared_error(y_true, y_pred.ravel())


Out[17]:
27093.83757456157

Manual exploration of different values of w and b


In [18]:
plt.figure(figsize=(10, 5))

# we are going to draw 2 plots in the same figure
# first plot, data and a few lines
ax1 = plt.subplot(121)
df.plot(kind = 'scatter',
        x = 'Height',
        y = 'Weight',
        title = 'Weight and Height in adults', ax=ax1)

# let's explore the cost function for a few values of b between -100 and +150
bbs = np.array([-100, -50, 0, 50, 100, 150])
mses = []  # we will append the values of the cost here, for each line
for b in bbs:
    y_pred = line(X, w = 2, b = b)
    mse = mean_squared_error(y_true, y_pred)
    mses.append(mse)
    plt.plot(X, y_pred)

# second plot: Cost function
ax2 = plt.subplot(122)
plt.plot(bbs, mses, 'o-')
plt.title('Cost as a function of b')
plt.xlabel('b')


Out[18]:
<matplotlib.text.Text at 0x7fdf49a2f438>

Linear Regression with Keras


In [19]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam, SGD


Using TensorFlow backend.

In [20]:
model = Sequential()

In [21]:
model.add(Dense(1, input_shape=(1,)))
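
A Dense layer with a single unit and a single input computes exactly x * w + b, the same line function defined above, so the model has just 2 trainable parameters (the slope w and the intercept b), as the summary below confirms.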

In [22]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_1 (Dense)              (None, 1)                 2         
=================================================================
Total params: 2
Trainable params: 2
Non-trainable params: 0
_________________________________________________________________

In [23]:
model.compile(Adam(lr = 0.8), loss = 'mean_squared_error')

In [24]:
model.fit(X, y_true, epochs = 40)


Epoch 1/40
10000/10000 [==============================] - 0s - loss: 930.7508        
Epoch 2/40
10000/10000 [==============================] - 0s - loss: 557.1471     
Epoch 3/40
10000/10000 [==============================] - 0s - loss: 512.3308     
Epoch 4/40
10000/10000 [==============================] - 0s - loss: 480.9471     
Epoch 5/40
10000/10000 [==============================] - 0s - loss: 425.9382     
Epoch 6/40
10000/10000 [==============================] - 0s - loss: 379.2186     
Epoch 7/40
10000/10000 [==============================] - 0s - loss: 358.1868     
Epoch 8/40
10000/10000 [==============================] - 0s - loss: 308.3074     
Epoch 9/40
10000/10000 [==============================] - 0s - loss: 296.8228     
Epoch 10/40
10000/10000 [==============================] - 0s - loss: 262.1658     
Epoch 11/40
10000/10000 [==============================] - 0s - loss: 251.7636     
Epoch 12/40
10000/10000 [==============================] - 0s - loss: 229.9584     
Epoch 13/40
10000/10000 [==============================] - 0s - loss: 225.2165     
Epoch 14/40
10000/10000 [==============================] - 0s - loss: 206.4181     
Epoch 15/40
10000/10000 [==============================] - 0s - loss: 216.6366     
Epoch 16/40
10000/10000 [==============================] - 0s - loss: 195.0012     
Epoch 17/40
10000/10000 [==============================] - 0s - loss: 183.3192     
Epoch 18/40
10000/10000 [==============================] - 0s - loss: 200.7255     
Epoch 19/40
10000/10000 [==============================] - 0s - loss: 185.1531     
Epoch 20/40
10000/10000 [==============================] - 0s - loss: 185.9111     
Epoch 21/40
10000/10000 [==============================] - 0s - loss: 178.0814     
Epoch 22/40
10000/10000 [==============================] - 0s - loss: 187.1244     
Epoch 23/40
10000/10000 [==============================] - 0s - loss: 175.2084     
Epoch 24/40
10000/10000 [==============================] - 0s - loss: 179.4172     
Epoch 25/40
10000/10000 [==============================] - 0s - loss: 185.9728     
Epoch 26/40
10000/10000 [==============================] - 0s - loss: 174.9497     
Epoch 27/40
10000/10000 [==============================] - 0s - loss: 173.8840     
Epoch 28/40
10000/10000 [==============================] - 0s - loss: 187.2916     
Epoch 29/40
10000/10000 [==============================] - 0s - loss: 177.3368     
Epoch 30/40
10000/10000 [==============================] - 0s - loss: 181.7620     
Epoch 31/40
10000/10000 [==============================] - 0s - loss: 184.2961     
Epoch 32/40
10000/10000 [==============================] - 0s - loss: 175.1786     
Epoch 33/40
10000/10000 [==============================] - 0s - loss: 175.1353     
Epoch 34/40
10000/10000 [==============================] - 0s - loss: 182.0800     
Epoch 35/40
10000/10000 [==============================] - 0s - loss: 175.2877     
Epoch 36/40
10000/10000 [==============================] - 0s - loss: 169.8961     
Epoch 37/40
10000/10000 [==============================] - 0s - loss: 179.9470     
Epoch 38/40
10000/10000 [==============================] - 0s - loss: 173.5737     
Epoch 39/40
10000/10000 [==============================] - 0s - loss: 173.9584     
Epoch 40/40
10000/10000 [==============================] - 0s - loss: 169.4149     
Out[24]:
<keras.callbacks.History at 0x7fdf383a9240>

In [25]:
y_pred = model.predict(X)

In [26]:
df.plot(kind = 'scatter',
        x = 'Height',
        y = 'Weight',
        title = 'Weight and Height in adults')
plt.plot(X, y_pred, color='red')


Out[26]:
[<matplotlib.lines.Line2D at 0x7fdf38352518>]

In [27]:
W, B = model.get_weights()

In [28]:
W


Out[28]:
array([[ 7.56767607]], dtype=float32)

In [29]:
B


Out[29]:
array([-348.63464355], dtype=float32)
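
W and B are simply the slope and intercept of the fitted line, so as a quick sanity check (a sketch, reusing the line helper and the x grid defined earlier) we could redraw it:

plt.plot(x, line(x, w = W[0, 0], b = B[0]), color = 'green')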

Evaluating Model Performance


In [30]:
from sklearn.metrics import r2_score

In [31]:
print("The R2 score is {:0.3f}".format(r2_score(y_true, y_pred)))


The R2 score is 0.795
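
R2 compares the model's squared error with that of a constant predictor that always outputs the mean: 1.0 is a perfect fit, 0.0 is no better than predicting the mean. A minimal manual equivalent of r2_score (a sketch):

ss_res = ((y_true - y_pred.ravel()) ** 2).sum()  # residual sum of squares
ss_tot = ((y_true - y_true.mean()) ** 2).sum()   # total sum of squares
print(1 - ss_res / ss_tot)                       # same value as r2_score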

Train Test Split


In [32]:
from sklearn.model_selection import train_test_split

In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y_true,
                                                    test_size=0.2)

In [34]:
len(X_train)


Out[34]:
8000

In [35]:
len(X_test)


Out[35]:
2000

In [36]:
# reset the weights to zero so that training restarts from scratch
W[0, 0] = 0.0
B[0] = 0.0
model.set_weights((W, B))

In [37]:
model.fit(X_train, y_train, epochs = 50, verbose = 0)


Out[37]:
<keras.callbacks.History at 0x7fdf331db6a0>

In [38]:
y_train_pred = model.predict(X_train).ravel()
y_test_pred = model.predict(X_test).ravel()

In [39]:
from sklearn.metrics import mean_squared_error as mse

In [40]:
print("The Mean Squared Error on the Train set is:\t{:0.1f}".format(mse(y_train, y_train_pred)))
print("The Mean Squared Error on the Test set is:\t{:0.1f}".format(mse(y_test, y_test_pred)))


The Mean Squared Error on the Train set is:	150.1
The Mean Squared Error on the Test set is:	151.5

In [41]:
print("The R2 score on the Train set is:\t{:0.3f}".format(r2_score(y_train, y_train_pred)))
print("The R2 score on the Test set is:\t{:0.3f}".format(r2_score(y_test, y_test_pred)))


The R2 score on the Train set is:	0.855
The R2 score on the Test set is:	0.850

Classification


In [42]:
df = pd.read_csv('./data/user_visit_duration.csv')

In [43]:
df.head()


Out[43]:
   Time (min)  Buy
0    2.000000    0
1    0.683333    0
2    3.216667    1
3    0.900000    0
4    1.533333    1

In [44]:
df.plot(kind = 'scatter', 
        x='Time (min)', 
        y='Buy')


Out[44]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fdf33162ac8>

In [45]:
model = Sequential()
model.add(Dense(1, input_shape=(1,), activation='sigmoid'))
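
With a sigmoid activation the single unit computes sigmoid(x * w + b) = 1 / (1 + exp(-(x * w + b))), which squashes the output between 0 and 1 so it can be read as the probability of a purchase: this model is exactly a logistic regression.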

In [46]:
model.compile(SGD(lr = 0.5), 
              loss = 'binary_crossentropy', 
              metrics=['accuracy'])

In [47]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_2 (Dense)              (None, 1)                 2         
=================================================================
Total params: 2
Trainable params: 2
Non-trainable params: 0
_________________________________________________________________

In [48]:
X = df[['Time (min)']].values
y = df['Buy'].values

model.fit(X, y, epochs = 25)


Epoch 1/25
100/100 [==============================] - 0s - loss: 0.6217 - acc: 0.5900     
Epoch 2/25
100/100 [==============================] - 0s - loss: 0.5992 - acc: 0.6100     
Epoch 3/25
100/100 [==============================] - 0s - loss: 0.5674 - acc: 0.6300     
Epoch 4/25
100/100 [==============================] - 0s - loss: 0.5899 - acc: 0.7000     
Epoch 5/25
100/100 [==============================] - 0s - loss: 0.5203 - acc: 0.7900     
Epoch 6/25
100/100 [==============================] - 0s - loss: 0.5268 - acc: 0.7500     
Epoch 7/25
100/100 [==============================] - 0s - loss: 0.5106 - acc: 0.7500     
Epoch 8/25
100/100 [==============================] - 0s - loss: 0.5015 - acc: 0.7500     
Epoch 9/25
100/100 [==============================] - 0s - loss: 0.4711 - acc: 0.8000     
Epoch 10/25
100/100 [==============================] - 0s - loss: 0.4593 - acc: 0.7700     
Epoch 11/25
100/100 [==============================] - 0s - loss: 0.4486 - acc: 0.7900     
Epoch 12/25
100/100 [==============================] - 0s - loss: 0.4426 - acc: 0.7900     
Epoch 13/25
100/100 [==============================] - 0s - loss: 0.4406 - acc: 0.8200     
Epoch 14/25
100/100 [==============================] - 0s - loss: 0.4295 - acc: 0.8300     
Epoch 15/25
100/100 [==============================] - 0s - loss: 0.4271 - acc: 0.8000     
Epoch 16/25
100/100 [==============================] - 0s - loss: 0.4229 - acc: 0.8200     
Epoch 17/25
100/100 [==============================] - 0s - loss: 0.4187 - acc: 0.8300     
Epoch 18/25
100/100 [==============================] - 0s - loss: 0.4201 - acc: 0.8000     
Epoch 19/25
100/100 [==============================] - 0s - loss: 0.4170 - acc: 0.8000     
Epoch 20/25
100/100 [==============================] - 0s - loss: 0.4065 - acc: 0.8200     
Epoch 21/25
100/100 [==============================] - 0s - loss: 0.4050 - acc: 0.8300     
Epoch 22/25
100/100 [==============================] - 0s - loss: 0.4113 - acc: 0.8000     
Epoch 23/25
100/100 [==============================] - 0s - loss: 0.4076 - acc: 0.8200     
Epoch 24/25
100/100 [==============================] - 0s - loss: 0.4141 - acc: 0.8300     
Epoch 25/25
100/100 [==============================] - 0s - loss: 0.4119 - acc: 0.8400     
Out[48]:
<keras.callbacks.History at 0x7fdf322ac048>

In [49]:
ax = df.plot(kind='scatter', 
             x = 'Time (min)', 
             y ='Buy',
             title = 'Purchase behavior VS time spent on site')

temp = np.linspace(0, 4)
ax.plot(temp, model.predict(temp), color = 'orange')
plt.legend(['model', 'data'])


Out[49]:
<matplotlib.legend.Legend at 0x7fdf32211f98>

In [50]:
temp_class = model.predict(temp) > 0.5

In [51]:
ax = df.plot(kind = 'scatter', 
             x = 'Time (min)', 
             y = 'Buy',
             title = 'Purchase behavior VS time spent on site')

temp = np.linspace(0, 4)
ax.plot(temp, temp_class, color = 'orange')
plt.legend(['model', 'data'])


Out[51]:
<matplotlib.legend.Legend at 0x7fdf322701d0>

In [52]:
y_pred = model.predict(X)
y_class_pred = y_pred > 0.5

In [53]:
from sklearn.metrics import accuracy_score

In [54]:
print("The accuracy score is {:0.3f}".format(accuracy_score(y, y_class_pred)))


The accuracy score is 0.840

Train/Test split


In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [56]:
# reset all weights to zero so the model can be retrained from scratch
params = model.get_weights()
params = [np.zeros(w.shape) for w in params]
model.set_weights(params)

In [57]:
print("The accuracy score is {:0.3f}".format(accuracy_score(y, model.predict(X) > 0.5)))


The accuracy score is 0.500

In [58]:
model.fit(X_train, y_train, epochs = 25, verbose = 0)


Out[58]:
<keras.callbacks.History at 0x7fdf32396a20>

In [59]:
print("The train accuracy score is {:0.3f}".format(accuracy_score(y_train, model.predict(X_train) > 0.5)))
print("The test accuracy score is {:0.3f}".format(accuracy_score(y_test, model.predict(X_test) > 0.5)))


The train accuracy score is 0.800
The test accuracy score is 0.850

Cross Validation


In [60]:
from keras.wrappers.scikit_learn import KerasClassifier

In [61]:
def build_logistic_regression_model():
    model = Sequential()
    model.add(Dense(1, 
                    input_shape = (1,), 
                    activation = 'sigmoid'))
    model.compile(SGD(lr = 0.5),
                  loss = 'binary_crossentropy',
                  metrics = ['accuracy'])
    return model

In [62]:
model = KerasClassifier(build_fn = build_logistic_regression_model,
                        epochs = 25,
                        verbose = 0)

In [63]:
from sklearn.model_selection import cross_val_score, KFold

In [64]:
cv = KFold(3, shuffle = True)
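
KFold(3, shuffle = True) splits the 100 samples into 3 folds of roughly 33 examples each; cross_val_score then trains a fresh model on 2 folds and scores it on the held-out one, which is why we obtain 3 accuracy values below.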

In [65]:
scores = cross_val_score(model, X, y, cv = cv)

In [66]:
scores


Out[66]:
array([ 0.73529412,  0.75757576,  0.87878788])

In [67]:
print("The cross validation accuracy is {:0.4f} ± {:0.4f}".format(scores.mean(), scores.std()))


The cross validation accuracy is 0.7906 ± 0.0631

Confusion Matrix


In [68]:
from sklearn.metrics import confusion_matrix

In [69]:
confusion_matrix(y, y_class_pred)


Out[69]:
array([[41,  9],
       [ 7, 43]])
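
By scikit-learn's convention rows are true classes and columns are predictions, so here TN = 41, FP = 9, FN = 7 and TP = 43. From these we get precision = TP / (TP + FP) = 43/52 ≈ 0.827 and recall = TP / (TP + FN) = 43/50 = 0.860, matching the scores printed further below.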

In [70]:
def pretty_confusion_matrix(y_true, y_pred, labels = ["False", "True"]):
    cm = confusion_matrix(y_true, y_pred)
    pred_labels = ['Predicted '+ l for l in labels]
    df = pd.DataFrame(cm, index = labels, columns = pred_labels)
    return df

In [71]:
pretty_confusion_matrix(y, y_class_pred, ['Not Buy', 'Buy'])


Out[71]:
         Predicted Not Buy  Predicted Buy
Not Buy                 41              9
Buy                      7             43

In [72]:
from sklearn.metrics import precision_score, recall_score, f1_score

In [73]:
print("Precision:\t{:0.3f}".format(precision_score(y, y_class_pred)))
print("Recall:  \t{:0.3f}".format(recall_score(y, y_class_pred)))
print("F1 Score:\t{:0.3f}".format(f1_score(y, y_class_pred)))


Precision:	0.827
Recall:  	0.860
F1 Score:	0.843

In [74]:
from sklearn.metrics import classification_report

In [75]:
print(classification_report(y, y_class_pred))


             precision    recall  f1-score   support

          0       0.85      0.82      0.84        50
          1       0.83      0.86      0.84        50

avg / total       0.84      0.84      0.84       100

Feature Preprocessing

Categorical Features


In [76]:
df = pd.read_csv('./data/weight-height.csv')
df.head()


Out[76]:
  Gender     Height      Weight
0   Male  73.847017  241.893563
1   Male  68.781904  162.310473
2   Male  74.110105  212.740856
3   Male  71.730978  220.042470
4   Male  69.881796  206.349801

In [77]:
df['Gender'].unique()


Out[77]:
array(['Male', 'Female'], dtype=object)

In [78]:
pd.get_dummies(df['Gender'], prefix = 'Gender').head()


Out[78]:
   Gender_Female  Gender_Male
0              0            1
1              0            1
2              0            1
3              0            1
4              0            1
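
Note that the two dummy columns are redundant (they always sum to 1), which can introduce collinearity in a linear model. As a sketch, pandas can drop one of them for us with the drop_first parameter:

pd.get_dummies(df['Gender'], prefix = 'Gender', drop_first = True).head()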

Feature Transformations

1) Rescale with fixed factor


In [79]:
df['Height (feet)'] = df['Height']/12.0
df['Weight (100 lbs)'] = df['Weight']/100.0

In [80]:
df.describe().round(2)


Out[80]:
         Height    Weight  Height (feet)  Weight (100 lbs)
count  10000.00  10000.00       10000.00          10000.00
mean      66.37    161.44           5.53              1.61
std        3.85     32.11           0.32              0.32
min       54.26     64.70           4.52              0.65
25%       63.51    135.82           5.29              1.36
50%       66.32    161.21           5.53              1.61
75%       69.17    187.17           5.76              1.87
max       79.00    269.99           6.58              2.70

2) MinMax normalization
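
MinMax normalization rescales each feature to the [0, 1] interval: x' = (x - x_min) / (x_max - x_min). This is what MinMaxScaler computes below.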


In [81]:
from sklearn.preprocessing import MinMaxScaler

mms = MinMaxScaler()
df['Weight_mms'] = mms.fit_transform(df[['Weight']])
df['Height_mms'] = mms.fit_transform(df[['Height']])
df.describe().round(2)


Out[81]:
         Height    Weight  Height (feet)  Weight (100 lbs)  Weight_mms  Height_mms
count  10000.00  10000.00       10000.00          10000.00    10000.00    10000.00
mean      66.37    161.44           5.53              1.61        0.47        0.49
std        3.85     32.11           0.32              0.32        0.16        0.16
min       54.26     64.70           4.52              0.65        0.00        0.00
25%       63.51    135.82           5.29              1.36        0.35        0.37
50%       66.32    161.21           5.53              1.61        0.47        0.49
75%       69.17    187.17           5.76              1.87        0.60        0.60
max       79.00    269.99           6.58              2.70        1.00        1.00

3) Standard normalization
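
Standard normalization removes the mean and divides by the standard deviation: x' = (x - mean) / std, so each rescaled feature ends up with mean 0 and standard deviation 1, as the describe output below confirms.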


In [82]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
df['Weight_ss'] = ss.fit_transform(df[['Weight']])
df['Height_ss'] = ss.fit_transform(df[['Height']])
df.describe().round(2)


Out[82]:
         Height    Weight  Height (feet)  Weight (100 lbs)  Weight_mms  Height_mms  Weight_ss  Height_ss
count  10000.00  10000.00       10000.00          10000.00    10000.00    10000.00   10000.00   10000.00
mean      66.37    161.44           5.53              1.61        0.47        0.49       0.00       0.00
std        3.85     32.11           0.32              0.32        0.16        0.16       1.00       1.00
min       54.26     64.70           4.52              0.65        0.00        0.00      -3.01      -3.15
25%       63.51    135.82           5.29              1.36        0.35        0.37      -0.80      -0.74
50%       66.32    161.21           5.53              1.61        0.47        0.49      -0.01      -0.01
75%       69.17    187.17           5.76              1.87        0.60        0.60       0.80       0.73
max       79.00    269.99           6.58              2.70        1.00        1.00       3.38       3.28

In [83]:
plt.figure(figsize=(15, 5))

for i, feature in enumerate(['Height', 'Height (feet)', 'Height_mms', 'Height_ss']):
    plt.subplot(1, 4, i+1)
    df[feature].plot(kind = 'hist', 
                     title = feature)
    plt.xlabel(feature)


Machine Learning Exercises

Exercise 1

You've just been hired at a real estate investment firm and they would like you to build a model for pricing houses. You are given a dataset that contains data for house prices and a few features like number of bedrooms, size in square feet and age of the house. Let's see if you can build a model that is able to predict the price. In this exercise we extend what we have learned about linear regression to a dataset with more than one feature. Here are the steps to complete it:

  1. Load the dataset ../data/housing-data.csv
  2. Plot the histograms for each feature
  3. Create 2 variables called X and y: X shall be a matrix with 3 columns (sqft, bdrms, age) and y shall be a vector with 1 column (price)
  4. Create a linear regression model in Keras with the appropriate number of inputs and output
  5. Split the data into train and test with a 20% test size
  6. Train the model on the training set and check its performance on the training and test sets
  7. How is your model doing? Is the loss decreasing?
  8. Try to improve your model with these experiments:
     • normalize the input features with one of the rescaling techniques mentioned above
     • use a different value for the learning rate of your model
     • use a different optimizer
  9. Once you're satisfied with training, check the R2 score on the test set

In [84]:
ex1 = pd.read_csv('./data/housing-data.csv')

In [85]:
ex1.head()


Out[85]:
   sqft  bdrms  age   price
0  2104      3   70  399900
1  1600      3   28  329900
2  2400      3   44  369000
3  1416      2   49  232000
4  3000      4   75  539900

In [86]:
ex1.shape


Out[86]:
(47, 4)

In [87]:
plt.figure(figsize=(20, 5))

for i, feature in enumerate(ex1.columns):
    plt.subplot(1, 4, i + 1)
    ex1[feature].plot(kind = 'hist',
                      title = feature)
    plt.xlabel(feature)
    plt.tight_layout()



In [88]:
X = ex1[['sqft', 'bdrms', 'age']]
X.head()


Out[88]:
   sqft  bdrms  age
0  2104      3   70
1  1600      3   28
2  2400      3   44
3  1416      2   49
4  3000      4   75

In [89]:
Y = ex1[['price']]
Y.head()


Out[89]:
    price
0  399900
1  329900
2  369000
3  232000
4  539900

In [90]:
X = X.values
Y = Y.values

In [91]:
X.shape


Out[91]:
(47, 3)

In [92]:
Y.shape


Out[92]:
(47, 1)

In [93]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD, Adam

In [94]:
model = Sequential()

In [95]:
model.add(Dense(1, input_shape = (3, )))

In [96]:
opt = Adam(lr = 0.8)

In [97]:
model.compile(optimizer = opt, loss = 'mean_squared_error')

In [98]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_6 (Dense)              (None, 1)                 4         
=================================================================
Total params: 4
Trainable params: 4
Non-trainable params: 0
_________________________________________________________________

In [99]:
from sklearn.model_selection import train_test_split

In [100]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [101]:
model.fit(X_train, Y_train, epochs = 10)


Epoch 1/10
37/37 [==============================] - 0s - loss: 126081951688.6487     
Epoch 2/10
37/37 [==============================] - 0s - loss: 123794320854.4865     
Epoch 3/10
37/37 [==============================] - 0s - loss: 121551407159.3513     
Epoch 4/10
37/37 [==============================] - 0s - loss: 119271369091.4594     
Epoch 5/10
37/37 [==============================] - 0s - loss: 116997365981.4054     
Epoch 6/10
37/37 [==============================] - 0s - loss: 114896879837.4054     
Epoch 7/10
37/37 [==============================] - 0s - loss: 112716553243.6757     
Epoch 8/10
37/37 [==============================] - 0s - loss: 110539034956.1081     
Epoch 9/10
37/37 [==============================] - 0s - loss: 108463120162.5946     
Epoch 10/10
37/37 [==============================] - 0s - loss: 106354297219.4594     
Out[101]:
<keras.callbacks.History at 0x7fdf3102e0b8>

In [102]:
from sklearn.metrics import r2_score

In [103]:
Y_train_predicted = model.predict(X_train)
Y_test_predicted = model.predict(X_test)

train_score = r2_score(Y_train, Y_train_predicted)
test_score = r2_score(Y_test, Y_test_predicted)

print('Train set score: \t{:0.3f}'.format(train_score))
print('Test set score: \t{:0.3f}'.format(test_score))


Train set score: 	-6.572
Test set score: 	-4.996

MinMaxScaler


In [104]:
from sklearn.preprocessing import MinMaxScaler

In [105]:
minmax = MinMaxScaler()

In [106]:
X = minmax.fit_transform(X)
Y = minmax.fit_transform(Y)


/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py:429: DataConversionWarning: Data with input dtype int64 was converted to float64 by MinMaxScaler.
  warnings.warn(msg, _DataConversionWarning)

In [107]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [108]:
model = Sequential()
model.add(Dense(1, input_shape = (3, )))
opt = Adam(lr = 0.8)
model.compile(optimizer = opt, loss = 'mean_squared_error')

In [109]:
model.fit(X_train, Y_train, epochs = 20)


Epoch 1/20
37/37 [==============================] - 0s - loss: 0.5628     
Epoch 2/20
37/37 [==============================] - 0s - loss: 0.2645     
Epoch 3/20
37/37 [==============================] - 0s - loss: 1.2662     
Epoch 4/20
37/37 [==============================] - 0s - loss: 0.2011     
Epoch 5/20
37/37 [==============================] - 0s - loss: 0.3728     
Epoch 6/20
37/37 [==============================] - 0s - loss: 0.3402     
Epoch 7/20
37/37 [==============================] - 0s - loss: 0.0775     
Epoch 8/20
37/37 [==============================] - 0s - loss: 0.3581     
Epoch 9/20
37/37 [==============================] - 0s - loss: 0.0748     
Epoch 10/20
37/37 [==============================] - 0s - loss: 0.1769     
Epoch 11/20
37/37 [==============================] - 0s - loss: 0.1618     
Epoch 12/20
37/37 [==============================] - 0s - loss: 0.0349     
Epoch 13/20
37/37 [==============================] - 0s - loss: 0.1677     
Epoch 14/20
37/37 [==============================] - 0s - loss: 0.0740     
Epoch 15/20
37/37 [==============================] - 0s - loss: 0.0468     
Epoch 16/20
37/37 [==============================] - 0s - loss: 0.0701     
Epoch 17/20
37/37 [==============================] - 0s - loss: 0.0217     
Epoch 18/20
37/37 [==============================] - 0s - loss: 0.0371     
Epoch 19/20
37/37 [==============================] - 0s - loss: 0.0165     
Epoch 20/20
37/37 [==============================] - 0s - loss: 0.0396     
Out[109]:
<keras.callbacks.History at 0x7fdf30aeb048>

In [110]:
Y_train_predicted = model.predict(X_train)
Y_test_predicted = model.predict(X_test)

train_score = r2_score(Y_train, Y_train_predicted)
test_score = r2_score(Y_test, Y_test_predicted)

print('Train set score: \t{:0.3f}'.format(train_score))
print('Test set score: \t{:0.3f}'.format(test_score))


Train set score: 	0.636
Test set score: 	0.623

Lowering the learning rate


In [111]:
model = Sequential()
model.add(Dense(1, input_shape = (3, )))
opt = Adam(lr = 0.1)
model.compile(optimizer = opt, loss = 'mean_squared_error')

In [112]:
model.fit(X_train, Y_train, epochs = 20, verbose = 1)


Epoch 1/20
37/37 [==============================] - 0s - loss: 0.4131     
Epoch 2/20
37/37 [==============================] - 0s - loss: 0.0867     
Epoch 3/20
37/37 [==============================] - 0s - loss: 0.1544     
Epoch 4/20
37/37 [==============================] - 0s - loss: 0.2154     
Epoch 5/20
37/37 [==============================] - 0s - loss: 0.1397     
Epoch 6/20
37/37 [==============================] - 0s - loss: 0.0660     
Epoch 7/20
37/37 [==============================] - 0s - loss: 0.0632     
Epoch 8/20
37/37 [==============================] - 0s - loss: 0.0896     
Epoch 9/20
37/37 [==============================] - 0s - loss: 0.0862     
Epoch 10/20
37/37 [==============================] - 0s - loss: 0.0537     
Epoch 11/20
37/37 [==============================] - 0s - loss: 0.0310     
Epoch 12/20
37/37 [==============================] - 0s - loss: 0.0312     
Epoch 13/20
37/37 [==============================] - 0s - loss: 0.0438     
Epoch 14/20
37/37 [==============================] - 0s - loss: 0.0470     
Epoch 15/20
37/37 [==============================] - 0s - loss: 0.0349     
Epoch 16/20
37/37 [==============================] - 0s - loss: 0.0199     
Epoch 17/20
37/37 [==============================] - 0s - loss: 0.0188     
Epoch 18/20
37/37 [==============================] - 0s - loss: 0.0235     
Epoch 19/20
37/37 [==============================] - 0s - loss: 0.0210     
Epoch 20/20
37/37 [==============================] - 0s - loss: 0.0157     
Out[112]:
<keras.callbacks.History at 0x7fdf30def5f8>

In [113]:
Y_train_predicted = model.predict(X_train)
Y_test_predicted = model.predict(X_test)

train_score = r2_score(Y_train, Y_train_predicted)
test_score = r2_score(Y_test, Y_test_predicted)

print('Train set score: \t{:0.3f}'.format(train_score))
print('Test set score: \t{:0.3f}'.format(test_score))


Train set score: 	0.724
Test set score: 	0.685

Using SGD


In [114]:
model = Sequential()
model.add(Dense(1, input_shape = (3, )))
opt = SGD(lr = 0.1)
model.compile(optimizer = opt, loss = 'mean_squared_error')

In [115]:
model.fit(X_train, Y_train, epochs = 20)


Epoch 1/20
37/37 [==============================] - 0s - loss: 0.1078     
Epoch 2/20
37/37 [==============================] - 0s - loss: 0.0953     
Epoch 3/20
37/37 [==============================] - 0s - loss: 0.0971     
Epoch 4/20
37/37 [==============================] - 0s - loss: 0.0920     
Epoch 5/20
37/37 [==============================] - 0s - loss: 0.0820     
Epoch 6/20
37/37 [==============================] - 0s - loss: 0.0793     
Epoch 7/20
37/37 [==============================] - 0s - loss: 0.0783     
Epoch 8/20
37/37 [==============================] - 0s - loss: 0.0756     
Epoch 9/20
37/37 [==============================] - 0s - loss: 0.0738     
Epoch 10/20
37/37 [==============================] - 0s - loss: 0.0710     
Epoch 11/20
37/37 [==============================] - 0s - loss: 0.0682     
Epoch 12/20
37/37 [==============================] - 0s - loss: 0.0676     
Epoch 13/20
37/37 [==============================] - 0s - loss: 0.0660     
Epoch 14/20
37/37 [==============================] - 0s - loss: 0.0627     
Epoch 15/20
37/37 [==============================] - 0s - loss: 0.0599     
Epoch 16/20
37/37 [==============================] - 0s - loss: 0.0572     
Epoch 17/20
37/37 [==============================] - 0s - loss: 0.0558     
Epoch 18/20
37/37 [==============================] - 0s - loss: 0.0557     
Epoch 19/20
37/37 [==============================] - 0s - loss: 0.0515     
Epoch 20/20
37/37 [==============================] - 0s - loss: 0.0518     
Out[115]:
<keras.callbacks.History at 0x7fdf3115ac88>

In [116]:
Y_train_predicted = model.predict(X_train)
Y_test_predicted = model.predict(X_test)

train_score = r2_score(Y_train, Y_train_predicted)
test_score = r2_score(Y_test, Y_test_predicted)

print('Train set score: \t{:0.3f}'.format(train_score))
print('Test set score: \t{:0.3f}'.format(test_score))


Train set score: 	-0.024
Test set score: 	0.062

Exercise 2

Your boss was extremely happy with your work on the housing price prediction model and decided to entrust you with a more challenging task. They've seen a lot of people leave the company recently and they would like to understand why that's happening. They have collected historical data on employees and they would like you to build a model that is able to predict which employee will leave next. They would like a model that is better than random guessing. They also prefer false negatives to false positives, in this first phase. Fields in the dataset include:

  • Employee satisfaction level
  • Last evaluation
  • Number of projects
  • Average monthly hours
  • Time spent at the company
  • Whether they have had a work accident
  • Whether they have had a promotion in the last 5 years
  • Department
  • Salary
  • Whether the employee has left

Your goal is to predict the binary outcome variable left using the rest of the data. Since the outcome is binary, this is a classification problem. Here are some things you may want to try out:

  1. Load the dataset at ../data/HR_comma_sep.csv and inspect it with .head(), .info() and .describe()
  2. Establish a benchmark: what would your accuracy score be if you predicted that everyone stays?
  3. Check if any feature needs rescaling. You may plot a histogram of the feature to decide which rescaling method is more appropriate
  4. Convert the categorical features into binary dummy columns. You will then have to combine them with the numerical features using pd.concat
  5. Do the usual train/test split with a 20% test size
  6. Play around with the learning rate and the optimizer
  7. Check the confusion matrix, precision and recall
  8. Check if you still get the same results if you use a 5-fold cross validation on all the data
  9. Is the model good enough for your boss?

As you will see in this exercise, a logistic regression model is not good enough to help your boss. In the next chapter we will learn how to go beyond linear models.

This dataset comes from https://www.kaggle.com/ludobenistant/hr-analytics/ and is released under CC BY-SA 4.0 License.


In [117]:
ex2 = pd.read_csv('./data/HR_comma_sep.csv')

In [118]:
ex2.head()


Out[118]:
   satisfaction_level  last_evaluation  number_project  average_montly_hours  time_spend_company  Work_accident  left  promotion_last_5years  sales  salary
0                0.38             0.53               2                   157                   3              0     1                      0  sales     low
1                0.80             0.86               5                   262                   6              0     1                      0  sales  medium
2                0.11             0.88               7                   272                   4              0     1                      0  sales  medium
3                0.72             0.87               5                   223                   5              0     1                      0  sales     low
4                0.37             0.52               2                   159                   3              0     1                      0  sales     low

In [119]:
ex2.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 14999 entries, 0 to 14998
Data columns (total 10 columns):
satisfaction_level       14999 non-null float64
last_evaluation          14999 non-null float64
number_project           14999 non-null int64
average_montly_hours     14999 non-null int64
time_spend_company       14999 non-null int64
Work_accident            14999 non-null int64
left                     14999 non-null int64
promotion_last_5years    14999 non-null int64
sales                    14999 non-null object
salary                   14999 non-null object
dtypes: float64(2), int64(6), object(2)
memory usage: 1.3+ MB

In [120]:
ex2.describe()


Out[120]:
       satisfaction_level  last_evaluation  number_project  average_montly_hours  time_spend_company  Work_accident          left  promotion_last_5years
count        14999.000000     14999.000000    14999.000000          14999.000000        14999.000000   14999.000000  14999.000000           14999.000000
mean             0.612834         0.716102        3.803054            201.050337            3.498233       0.144610      0.238083               0.021268
std              0.248631         0.171169        1.232592             49.943099            1.460136       0.351719      0.425924               0.144281
min              0.090000         0.360000        2.000000             96.000000            2.000000       0.000000      0.000000               0.000000
25%              0.440000         0.560000        3.000000            156.000000            3.000000       0.000000      0.000000               0.000000
50%              0.640000         0.720000        4.000000            200.000000            3.000000       0.000000      0.000000               0.000000
75%              0.820000         0.870000        5.000000            245.000000            4.000000       0.000000      0.000000               0.000000
max              1.000000         1.000000        7.000000            310.000000           10.000000       1.000000      1.000000               1.000000

In [121]:
# Baseline: accuracy if we predicted that everyone stays
# (left == 1 means the employee has already left)
acc = (1 - (ex2['left'].value_counts()[1] / ex2.shape[0])) * 100
print('Predicting all would stay yields accuracy: \t{:0.2f} %'.format(acc))


Predicting all would stay yields accuracy: 	76.19 %

In [122]:
len(ex2.columns)


Out[122]:
10

In [123]:
ex2['average_montly_hours'].plot(kind = 'hist', 
                                 figsize = (10, 5))


Out[123]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fdf30e49e10>

In [124]:
ex2['time_spend_company'].plot(kind = 'hist', 
                                 figsize = (10, 5))


Out[124]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fdf30a40e10>

In [125]:
minmax = MinMaxScaler()

In [126]:
# rescale to [0, 1]; double brackets keep a 2-D DataFrame, as the scaler expects
ex2['average_montly_hours'] = minmax.fit_transform(ex2[['average_montly_hours']])
ex2['time_spend_company'] = minmax.fit_transform(ex2[['time_spend_company']])


/usr/local/lib/python3.5/dist-packages/sklearn/utils/validation.py:429: DataConversionWarning: Data with input dtype int64 was converted to float64 by MinMaxScaler.
  warnings.warn(msg, _DataConversionWarning)

In [127]:
ex2['average_montly_hours'].plot(kind = 'hist', 
                                 figsize = (10, 5))


Out[127]:
<matplotlib.axes._subplots.AxesSubplot at 0x7fdf31131da0>

In [128]:
# Sales and salary are categorical data
ex2_dummies = pd.get_dummies(ex2[['sales', 'salary']])

In [129]:
ex2_dummies.head()


Out[129]:
   sales_IT  sales_RandD  sales_accounting  sales_hr  sales_management  sales_marketing  sales_product_mng  sales_sales  sales_support  sales_technical  salary_high  salary_low  salary_medium
0         0            0                 0         0                 0                0                  0            1              0                0            0           1              0
1         0            0                 0         0                 0                0                  0            1              0                0            0           0              1
2         0            0                 0         0                 0                0                  0            1              0                0            0           0              1
3         0            0                 0         0                 0                0                  0            1              0                0            0           1              0
4         0            0                 0         0                 0                0                  0            1              0                0            0           1              0

In [130]:
del ex2['sales'], ex2['salary']

In [131]:
ex2.head()


Out[131]:
   satisfaction_level  last_evaluation  number_project  average_montly_hours  time_spend_company  Work_accident  left  promotion_last_5years
0                0.38             0.53               2              0.285047               0.125              0     1                      0
1                0.80             0.86               5              0.775701               0.500              0     1                      0
2                0.11             0.88               7              0.822430               0.250              0     1                      0
3                0.72             0.87               5              0.593458               0.375              0     1                      0
4                0.37             0.52               2              0.294393               0.125              0     1                      0

In [132]:
len(ex2.columns)


Out[132]:
8

In [133]:
data = pd.concat([ex2, ex2_dummies], axis = 1)

In [134]:
data.head()


Out[134]:
   satisfaction_level  last_evaluation  number_project  average_montly_hours  time_spend_company  Work_accident  left  promotion_last_5years  sales_IT  sales_RandD  ...  sales_hr  sales_management  sales_marketing  sales_product_mng  sales_sales  sales_support  sales_technical  salary_high  salary_low  salary_medium
0                0.38             0.53               2              0.285047               0.125              0     1                      0         0            0  ...         0                 0                0                  0            1              0                0            0           1              0
1                0.80             0.86               5              0.775701               0.500              0     1                      0         0            0  ...         0                 0                0                  0            1              0                0            0           0              1
2                0.11             0.88               7              0.822430               0.250              0     1                      0         0            0  ...         0                 0                0                  0            1              0                0            0           0              1
3                0.72             0.87               5              0.593458               0.375              0     1                      0         0            0  ...         0                 0                0                  0            1              0                0            0           1              0
4                0.37             0.52               2              0.294393               0.125              0     1                      0         0            0  ...         0                 0                0                  0            1              0                0            0           1              0

5 rows × 21 columns


In [135]:
X = data.loc[:, data.columns != 'left']  # all columns except the target

In [136]:
X.head()


Out[136]:
   satisfaction_level  last_evaluation  number_project  average_montly_hours  time_spend_company  Work_accident  promotion_last_5years  sales_IT  sales_RandD  sales_accounting  sales_hr  sales_management  sales_marketing  sales_product_mng  sales_sales  sales_support  sales_technical  salary_high  salary_low  salary_medium
0                0.38             0.53               2              0.285047               0.125              0                      0         0            0                 0         0                 0                0                  0            1              0                0            0           1              0
1                0.80             0.86               5              0.775701               0.500              0                      0         0            0                 0         0                 0                0                  0            1              0                0            0           0              1
2                0.11             0.88               7              0.822430               0.250              0                      0         0            0                 0         0                 0                0                  0            1              0                0            0           0              1
3                0.72             0.87               5              0.593458               0.375              0                      0         0            0                 0         0                 0                0                  0            1              0                0            0           1              0
4                0.37             0.52               2              0.294393               0.125              0                      0         0            0                 0         0                 0                0                  0            1              0                0            0           1              0

In [137]:
len(X.columns)


Out[137]:
20

In [138]:
Y = data['left']

In [139]:
from sklearn.model_selection import train_test_split

In [140]:
X = X.values
Y = Y.values

In [141]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

In [142]:
model = Sequential()
model.add(Dense(1, input_dim = X_train.shape[1], activation = 'sigmoid'))
model.compile(optimizer = Adam(lr = 0.05), loss = 'binary_crossentropy', metrics = ['accuracy'])

In [143]:
model.summary()


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
=================================================================
dense_10 (Dense)             (None, 1)                 21        
=================================================================
Total params: 21
Trainable params: 21
Non-trainable params: 0
_________________________________________________________________

In [144]:
model.fit(X_train, Y_train, epochs = 20)


Epoch 1/20
11999/11999 [==============================] - 1s - loss: 0.4806 - acc: 0.7661      
Epoch 2/20
11999/11999 [==============================] - 1s - loss: 0.4333 - acc: 0.7847     
Epoch 3/20
11999/11999 [==============================] - 1s - loss: 0.4359 - acc: 0.7895     
Epoch 4/20
11999/11999 [==============================] - 1s - loss: 0.4328 - acc: 0.7936     
Epoch 5/20
11999/11999 [==============================] - 1s - loss: 0.4329 - acc: 0.7918     
Epoch 6/20
11999/11999 [==============================] - 1s - loss: 0.4331 - acc: 0.7901     
Epoch 7/20
11999/11999 [==============================] - 1s - loss: 0.4333 - acc: 0.7911     
Epoch 8/20
11999/11999 [==============================] - 1s - loss: 0.4324 - acc: 0.7911     
Epoch 9/20
11999/11999 [==============================] - 1s - loss: 0.4333 - acc: 0.7905     
Epoch 10/20
11999/11999 [==============================] - 1s - loss: 0.4312 - acc: 0.7966     
Epoch 11/20
11999/11999 [==============================] - 1s - loss: 0.4336 - acc: 0.7912     
Epoch 12/20
11999/11999 [==============================] - 1s - loss: 0.4351 - acc: 0.7850     
Epoch 13/20
11999/11999 [==============================] - 1s - loss: 0.4339 - acc: 0.7916     
Epoch 14/20
11999/11999 [==============================] - 0s - loss: 0.4330 - acc: 0.7964     
Epoch 15/20
11999/11999 [==============================] - 1s - loss: 0.4330 - acc: 0.7896     
Epoch 16/20
11999/11999 [==============================] - 1s - loss: 0.4333 - acc: 0.7928     
Epoch 17/20
11999/11999 [==============================] - 0s - loss: 0.4337 - acc: 0.7949     
Epoch 18/20
11999/11999 [==============================] - 0s - loss: 0.4370 - acc: 0.7907     
Epoch 19/20
11999/11999 [==============================] - 0s - loss: 0.4355 - acc: 0.7904     
Epoch 20/20
11999/11999 [==============================] - 1s - loss: 0.4334 - acc: 0.7971     
Out[144]:
<keras.callbacks.History at 0x7fdf30cafa58>

In [145]:
Y_test_predicted = model.predict_classes(X_test)


2304/3000 [======================>.......] - ETA: 0s 

In [146]:
from sklearn.metrics import confusion_matrix, classification_report

In [147]:
def pretty_confusion_matrix(y_true, y_pred, labels=["False", "True"]):
    cm = confusion_matrix(y_true, y_pred)
    pred_labels = ['Predicted '+ l for l in labels]
    df = pd.DataFrame(cm, index = labels, columns = pred_labels)
    return df

In [148]:
pretty_confusion_matrix(Y_test, Y_test_predicted, labels=['Stay', 'Leave'])


Out[148]:
       Predicted Stay  Predicted Leave
Stay             2075              197
Leave             483              245
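
Notice the 483 false negatives (employees who left but were predicted to stay): recall on the Leave class is only 245 / (245 + 483) ≈ 0.34, as the classification report below confirms, so the model misses most of the employees who actually leave.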

In [149]:
print(classification_report(Y_test, Y_test_predicted))


             precision    recall  f1-score   support

          0       0.81      0.91      0.86      2272
          1       0.55      0.34      0.42       728

avg / total       0.75      0.77      0.75      3000


In [150]:
from sklearn.model_selection import cross_val_score, KFold

In [151]:
from keras.wrappers.scikit_learn import KerasClassifier

In [152]:
def build_model():
    model = Sequential()
    model.add(Dense(1, 
                    input_dim = 20, 
                    activation='sigmoid'))
    model.compile(Adam(lr = 0.1), 'binary_crossentropy', metrics=['accuracy'])
    return model

In [153]:
model = KerasClassifier(build_fn = build_model, 
                        epochs = 10, 
                        verbose = 0)

In [154]:
from sklearn.model_selection import KFold, cross_val_score

In [155]:
cross_val = KFold(5, shuffle = True)

In [156]:
scores = cross_val_score(model, X, Y, cv = cross_val)

In [157]:
print("The cross validation accuracy is {:0.4f} ± {:0.4f}".format(scores.mean(), scores.std()))


The cross validation accuracy is 0.7789 ± 0.0111

In [158]:
scores


Out[158]:
array([ 0.79466667,  0.77633333,  0.76333333,  0.77233333,  0.78759587])

The linear model is not sufficient!