In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
In [2]:
df = pd.read_csv('./data/weight-height.csv')
In [3]:
df.head()
Out[3]:
In [4]:
df.plot(kind = 'scatter',
        figsize = (7, 7),
        x = 'Height',
        y = 'Weight',
        title = 'Weight and Height in adults')
Out[4]:
In [5]:
df.plot(kind = 'scatter',
        figsize = (7, 7),
        x = 'Height',
        y = 'Weight',
        title = 'Weight and Height in adults')
# Here we're plotting the red line 'by hand' with fixed values
# We'll try to learn this line with an algorithm below
plt.plot([55, 78], [75, 250], color='red', linewidth=3)
Out[5]:
In [6]:
def line(x, w=0, b=0):
    return x * w + b
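As a concrete check (added here for illustration, not part of the original notebook): the red line sketched earlier runs roughly through (55, 75) and (78, 250), which pins down its slope and intercept, and plugging in a 70-inch person gives a prediction near 189 lbs:
w_hand = (250.0 - 75.0) / (78.0 - 55.0)  # slope of the hand-drawn line, ~7.6
b_hand = 75.0 - w_hand * 55.0            # intercept, ~-343
line(70, w = w_hand, b = b_hand)         # ~189, a plausible weight for a 70-inch adult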
In [7]:
x = np.linspace(55, 80, 100)
In [8]:
x
Out[8]:
In [9]:
yhat = line(x, w = 0, b = 0)
In [10]:
yhat
Out[10]:
In [11]:
df.plot(kind = 'scatter',
        figsize = (7, 7),
        x = 'Height',
        y = 'Weight',
        title = 'Weight and Height in adults')
plt.plot(x, yhat, color='red', linewidth=3)
Out[11]:
In [12]:
def mean_squared_error(y_true, y_pred):
    s = (y_true - y_pred) ** 2
    return s.mean()
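A quick sanity check of the cost function (an addition, not in the original notebook): it should be exactly 0 for a perfect prediction and grow with the squared distance otherwise:
a = np.array([1.0, 2.0, 3.0])
print(mean_squared_error(a, a))      # 0.0: perfect prediction
print(mean_squared_error(a, a + 1))  # 1.0: off by 1 everywhere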
In [13]:
X = df[['Height']].values
y_true = df['Weight'].values
In [14]:
y_true
Out[14]:
In [15]:
y_pred = line(X)
In [16]:
y_pred
Out[16]:
In [17]:
mean_squared_error(y_true, y_pred.ravel())
Out[17]:
In [18]:
plt.figure(figsize=(10, 5))

# we are going to draw 2 plots in the same figure
# first plot: data and a few candidate lines
ax1 = plt.subplot(121)
df.plot(kind = 'scatter',
        x = 'Height',
        y = 'Weight',
        title = 'Weight and Height in adults', ax=ax1)

# let's explore the cost function for a few values of b between -100 and +150
bbs = np.array([-100, -50, 0, 50, 100, 150])
mses = []  # we will append the value of the cost here, one for each line
for b in bbs:
    y_pred = line(X, w = 2, b = b)
    # ravel flattens y_pred to 1-D so it aligns with y_true
    # (without it, broadcasting y_true against the 2-D y_pred
    #  would silently produce an N x N matrix)
    mse = mean_squared_error(y_true, y_pred.ravel())
    mses.append(mse)
    plt.plot(X, y_pred)

# second plot: cost as a function of b
ax2 = plt.subplot(122)
plt.plot(bbs, mses, 'o-')
plt.title('Cost as a function of b')
plt.xlabel('b')
Out[18]:
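The same exploration works for the slope. A minimal sketch (added here, not in the original notebook) that fixes b = 0 and sweeps w, reusing the line and mean_squared_error helpers; the cost is again a parabola, this time in w:
wws = np.linspace(0, 6, 13)
mses_w = [mean_squared_error(y_true, line(X, w = w, b = 0).ravel()) for w in wws]
plt.plot(wws, mses_w, 'o-')
plt.title('Cost as a function of w')
plt.xlabel('w')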
In [19]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import Adam, SGD
In [20]:
model = Sequential()
In [21]:
model.add(Dense(1, input_shape=(1,)))
In [22]:
model.summary()
In [23]:
model.compile(Adam(lr = 0.8), loss = 'mean_squared_error')
In [24]:
model.fit(X, y_true, epochs = 40)
Out[24]:
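In Keras, fit returns a History object whose history['loss'] list records the loss after each epoch. A short sketch to check convergence visually (an addition to the original notebook; note that calling fit again continues training from the current weights):
history = model.fit(X, y_true, epochs = 40, verbose = 0)
plt.plot(history.history['loss'])
plt.title('Training loss per epoch')
plt.xlabel('epoch')
plt.ylabel('mean squared error')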
In [25]:
y_pred = model.predict(X)
In [26]:
df.plot(kind = 'scatter',
        x = 'Height',
        y = 'Weight',
        title = 'Weight and Height in adults')
plt.plot(X, y_pred, color='red')
Out[26]:
In [27]:
W, B = model.get_weights()
In [28]:
W
Out[28]:
In [29]:
B
Out[29]:
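Since the network is a single linear unit, its predictions should coincide with the line helper evaluated at the learned parameters. A check added here for illustration (not in the original notebook):
np.allclose(model.predict(X), line(X, w = W[0, 0], b = B[0]))  # True, up to float32 precision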
In [30]:
from sklearn.metrics import r2_score
In [31]:
print("The R2 score is {:0.3f}".format(r2_score(y_true, y_pred)))
In [32]:
from sklearn.model_selection import train_test_split
In [33]:
X_train, X_test, y_train, y_test = train_test_split(X, y_true,
                                                    test_size=0.2)
In [34]:
len(X_train)
Out[34]:
In [35]:
len(X_test)
Out[35]:
In [36]:
W[0, 0] = 0.0
B[0] = 0.0
model.set_weights((W, B))
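Zeroing the arrays in place works here because the model has exactly one weight and one bias. A more general alternative (a sketch, not from the original notebook) is simply to rebuild and recompile the model, which draws fresh random initial weights:
model = Sequential()
model.add(Dense(1, input_shape=(1,)))
model.compile(Adam(lr = 0.8), loss = 'mean_squared_error')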
In [37]:
model.fit(X_train, y_train, epochs = 50, verbose = 0)
Out[37]:
In [38]:
y_train_pred = model.predict(X_train).ravel()
y_test_pred = model.predict(X_test).ravel()
In [39]:
from sklearn.metrics import mean_squared_error as mse
In [40]:
print("The Mean Squared Error on the Train set is:\t{:0.1f}".format(mse(y_train, y_train_pred)))
print("The Mean Squared Error on the Test set is:\t{:0.1f}".format(mse(y_test, y_test_pred)))
In [41]:
print("The R2 score on the Train set is:\t{:0.3f}".format(r2_score(y_train, y_train_pred)))
print("The R2 score on the Test set is:\t{:0.3f}".format(r2_score(y_test, y_test_pred)))
In [42]:
df = pd.read_csv('./data/user_visit_duration.csv')
In [43]:
df.head()
Out[43]:
In [44]:
df.plot(kind = 'scatter',
        x = 'Time (min)',
        y = 'Buy')
Out[44]:
In [45]:
model = Sequential()
model.add(Dense(1, input_shape=(1,), activation='sigmoid'))
In [46]:
model.compile(SGD(lr = 0.5),
              loss = 'binary_crossentropy',
              metrics = ['accuracy'])
In [47]:
model.summary()
In [48]:
X = df[['Time (min)']].values
y = df['Buy'].values
model.fit(X, y, epochs = 25)
Out[48]:
In [49]:
ax = df.plot(kind = 'scatter',
             x = 'Time (min)',
             y = 'Buy',
             title = 'Purchase behavior VS time spent on site')
temp = np.linspace(0, 4)
ax.plot(temp, model.predict(temp), color = 'orange')
plt.legend(['model', 'data'])
Out[49]:
In [50]:
temp_class = model.predict(temp) > 0.5
In [51]:
ax = df.plot(kind = 'scatter',
             x = 'Time (min)',
             y = 'Buy',
             title = 'Purchase behavior VS time spent on site')
temp = np.linspace(0, 4)
ax.plot(temp, temp_class, color = 'orange')
plt.legend(['model', 'data'])
Out[51]:
In [52]:
y_pred = model.predict(X)
y_class_pred = y_pred > 0.5
In [53]:
from sklearn.metrics import accuracy_score
In [54]:
print("The accuracy score is {:0.3f}".format(accuracy_score(y, y_class_pred)))
In [55]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)
In [56]:
params = model.get_weights()
params = [np.zeros(w.shape) for w in params]
model.set_weights(params)
In [57]:
print("The accuracy score is {:0.3f}".format(accuracy_score(y, model.predict(X) > 0.5)))
In [58]:
model.fit(X_train, y_train, epochs = 25, verbose = 0)
Out[58]:
In [59]:
print("The train accuracy score is {:0.3f}".format(accuracy_score(y_train, model.predict(X_train) > 0.5)))
print("The test accuracy score is {:0.3f}".format(accuracy_score(y_test, model.predict(X_test) > 0.5)))
In [60]:
from keras.wrappers.scikit_learn import KerasClassifier
In [61]:
def build_logistic_regression_model():
    model = Sequential()
    model.add(Dense(1,
                    input_shape = (1,),
                    activation = 'sigmoid'))
    model.compile(SGD(lr = 0.5),
                  loss = 'binary_crossentropy',
                  metrics = ['accuracy'])
    return model
In [62]:
model = KerasClassifier(build_fn = build_logistic_regression_model,
                        epochs = 25,
                        verbose = 0)
In [63]:
from sklearn.model_selection import cross_val_score, KFold
In [64]:
cv = KFold(3, shuffle = True)
In [65]:
scores = cross_val_score(model, X, y, cv = cv)
In [66]:
scores
Out[66]:
In [67]:
print("The cross validation accuracy is {:0.4f} ± {:0.4f}".format(scores.mean(), scores.std()))
In [68]:
from sklearn.metrics import confusion_matrix
In [69]:
confusion_matrix(y, y_class_pred)
Out[69]:
In [70]:
def pretty_confusion_matrix(y_true, y_pred, labels = ["False", "True"]):
    cm = confusion_matrix(y_true, y_pred)
    pred_labels = ['Predicted ' + l for l in labels]
    df = pd.DataFrame(cm, index = labels, columns = pred_labels)
    return df
In [71]:
pretty_confusion_matrix(y, y_class_pred, ['Not Buy', 'Buy'])
Out[71]:
In [72]:
from sklearn.metrics import precision_score, recall_score, f1_score
In [73]:
print("Precision:\t{:0.3f}".format(precision_score(y, y_class_pred)))
print("Recall: \t{:0.3f}".format(recall_score(y, y_class_pred)))
print("F1 Score:\t{:0.3f}".format(f1_score(y, y_class_pred)))
In [74]:
from sklearn.metrics import classification_report
In [75]:
print(classification_report(y, y_class_pred))
In [76]:
df = pd.read_csv('./data/weight-height.csv')
df.head()
Out[76]:
In [77]:
df['Gender'].unique()
Out[77]:
In [78]:
pd.get_dummies(df['Gender'], prefix = 'Gender').head()
Out[78]:
In [79]:
df['Height (feet)'] = df['Height']/12.0
df['Weight (100 lbs)'] = df['Weight']/100.0
In [80]:
df.describe().round(2)
Out[80]:
In [81]:
from sklearn.preprocessing import MinMaxScaler
mms = MinMaxScaler()
df['Weight_mms'] = mms.fit_transform(df[['Weight']])
df['Height_mms'] = mms.fit_transform(df[['Height']])
df.describe().round(2)
Out[81]:
In [82]:
from sklearn.preprocessing import StandardScaler
ss = StandardScaler()
df['Weight_ss'] = ss.fit_transform(df[['Weight']])
df['Height_ss'] = ss.fit_transform(df[['Height']])
df.describe().round(2)
Out[82]:
In [83]:
plt.figure(figsize=(15, 5))
for i, feature in enumerate(['Height', 'Height (feet)', 'Height_mms', 'Height_ss']):
    plt.subplot(1, 4, i + 1)
    df[feature].plot(kind = 'hist',
                     title = feature)
    plt.xlabel(feature)
You've just been hired at a real estate investment firm and they would like you to build a model for pricing houses. You are given a dataset that contains data for house prices and a few features like number of bedrooms, size in square feet, and age of the house. Let's see if you can build a model that is able to predict the price. In this exercise we extend what we have learned about linear regression to a dataset with more than one feature. Here are the steps to complete it:
- load the dataset ./data/housing-data.csv and inspect it
- plot a histogram of each feature
- create two variables, X with the features sqft, bdrms and age, and Y with the price
- build a linear regression model in Keras with 3 inputs and 1 output, and train it on an 80/20 train/test split
- check the R2 score on the train and test sets
- try to improve the model: rescale the features and the target with MinMaxScaler, tune the learning rate, and compare the Adam and SGD optimizers
In [84]:
ex1 = pd.read_csv('./data/housing-data.csv')
In [85]:
ex1.head()
Out[85]:
In [86]:
ex1.shape
Out[86]:
In [87]:
plt.figure(figsize=(20, 5))
for i, feature in enumerate(ex1.columns):
    plt.subplot(1, 4, i + 1)
    ex1[feature].plot(kind = 'hist',
                      title = feature)
    plt.xlabel(feature)
plt.tight_layout()
In [88]:
X = ex1[['sqft', 'bdrms', 'age']]
X.head()
Out[88]:
In [89]:
Y = ex1[['price']]
Y.head()
Out[89]:
In [90]:
X = X.values
Y = Y.values
In [91]:
X.shape
Out[91]:
In [92]:
Y.shape
Out[92]:
In [93]:
from keras.models import Sequential
from keras.layers import Dense
from keras.optimizers import SGD, Adam
In [94]:
model = Sequential()
In [95]:
model.add(Dense(1, input_shape = (3, )))
In [96]:
opt = Adam(lr = 0.8)
In [97]:
model.compile(optimizer = opt, loss = 'mean_squared_error')
In [98]:
model.summary()
In [99]:
from sklearn.model_selection import train_test_split
In [100]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
In [101]:
model.fit(X_train, Y_train, epochs = 10)
Out[101]:
In [102]:
from sklearn.metrics import r2_score
In [103]:
Y_train_predicted = model.predict(X_train)
Y_test_predicted = model.predict(X_test)
train_score = r2_score(Y_train, Y_train_predicted)
test_score = r2_score(Y_test, Y_test_predicted)
print('Train set score: \t{:0.3f}'.format(train_score))
print('Test set score: \t{:0.3f}'.format(test_score))
In [104]:
from sklearn.preprocessing import MinMaxScaler
In [105]:
minmax = MinMaxScaler()
In [106]:
X = minmax.fit_transform(X)
Y = minmax.fit_transform(Y)
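One caveat: reusing the same MinMaxScaler instance leaves it fitted to Y only, so the X scaling can no longer be inverted. If you need to map predictions back to dollars, a sketch with two separate scalers (variable names are illustrative, not from the original notebook):
x_scaler = MinMaxScaler()
y_scaler = MinMaxScaler()
X = x_scaler.fit_transform(X)
Y = y_scaler.fit_transform(Y)
# later, y_scaler.inverse_transform(model.predict(X_test)) recovers prices in dollars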
In [107]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
In [108]:
model = Sequential()
model.add(Dense(1, input_shape = (3, )))
opt = Adam(lr = 0.8)
model.compile(optimizer = opt, loss = 'mean_squared_error')
In [109]:
model.fit(X_train, Y_train, epochs = 20)
Out[109]:
In [110]:
Y_train_predicted = model.predict(X_train)
Y_test_predicted = model.predict(X_test)
train_score = r2_score(Y_train, Y_train_predicted)
test_score = r2_score(Y_test, Y_test_predicted)
print('Train set score: \t{:0.3f}'.format(train_score))
print('Test set score: \t{:0.3f}'.format(test_score))
In [111]:
model = Sequential()
model.add(Dense(1, input_shape = (3, )))
opt = Adam(lr = 0.1)
model.compile(optimizer = opt, loss = 'mean_squared_error')
In [112]:
model.fit(X_train, Y_train, epochs = 20, verbose = 1)
Out[112]:
In [113]:
Y_train_predicted = model.predict(X_train)
Y_test_predicted = model.predict(X_test)
train_score = r2_score(Y_train, Y_train_predicted)
test_score = r2_score(Y_test, Y_test_predicted)
print('Train set score: \t{:0.3f}'.format(train_score))
print('Test set score: \t{:0.3f}'.format(test_score))
In [114]:
model = Sequential()
model.add(Dense(1, input_shape = (3, )))
opt = SGD(lr = 0.1)
model.compile(optimizer = opt, loss = 'mean_squared_error')
In [115]:
model.fit(X_train, Y_train, epochs = 20)
Out[115]:
In [116]:
Y_train_predicted = model.predict(X_train)
Y_test_predicted = model.predict(X_test)
train_score = r2_score(Y_train, Y_train_predicted)
test_score = r2_score(Y_test, Y_test_predicted)
print('Train set score: \t{:0.3f}'.format(train_score))
print('Test set score: \t{:0.3f}'.format(test_score))
Your boss was extremely happy with your work on the housing price prediction model and decided to entrust you with a more challenging task. They've seen a lot of people leave the company recently and they would like to understand why that's happening. They have collected historical data on employees and they would like you to build a model that is able to predict which employee will leave next. They would like a model that is better than random guessing. They also prefer false negatives to false positives, in this first phase. Fields in the dataset include numerical features like average_montly_hours and time_spend_company, two categorical columns (sales, i.e. the department, and salary), and the binary outcome variable left.
Your goal is to predict the binary outcome variable left using the rest of the data. Since the outcome is binary, this is a classification problem. Here are some things you may want to try out:
- inspect the data with .head(), .info() and .describe()
- establish a benchmark: what accuracy would you get by always predicting that an employee stays?
- check if any feature needs rescaling, plotting a histogram to decide
- convert the categorical features into dummy columns and combine them with the numerical features using pd.concat
- do the usual train/test split with a 20% test size
- build a logistic regression model and evaluate it with a confusion matrix and a classification report
- check whether you get similar accuracy with K-fold cross validation on all the data
As you will see in this exercise, a logistic regression model is not good enough to help your boss. In the next chapter we will learn how to go beyond linear models.
This dataset comes from https://www.kaggle.com/ludobenistant/hr-analytics/ and is released under CC BY-SA 4.0 License.
In [117]:
ex2 = pd.read_csv('./data/HR_comma_sep.csv')
In [118]:
ex2.head()
Out[118]:
In [119]:
ex2.info()
In [120]:
ex2.describe()
Out[120]:
In [121]:
# Accuracy if we predict that everybody stays
# left = 1 : the employee already left
acc = (1 - (ex2['left'].value_counts()[1] / ex2.shape[0])) * 100
print('Predicting all would stay yields accuracy: \t{:0.2f} %'.format(acc))
In [122]:
len(ex2.columns)
Out[122]:
In [123]:
ex2['average_montly_hours'].plot(kind = 'hist',
                                 figsize = (10, 5))
Out[123]:
In [124]:
ex2['time_spend_company'].plot(kind = 'hist',
                               figsize = (10, 5))
Out[124]:
In [125]:
minmax = MinMaxScaler()
In [126]:
ex2['average_montly_hours'] = minmax.fit_transform(ex2[['average_montly_hours']])
ex2['time_spend_company'] = minmax.fit_transform(ex2[['time_spend_company']])
In [127]:
ex2['average_montly_hours'].plot(kind = 'hist',
                                 figsize = (10, 5))
Out[127]:
In [128]:
# Sales and salary are categorical data
ex2_dummies = pd.get_dummies(ex2[['sales', 'salary']])
In [129]:
ex2_dummies.head()
Out[129]:
In [130]:
del ex2['sales'], ex2['salary']
In [131]:
ex2.head()
Out[131]:
In [132]:
len(ex2.columns)
Out[132]:
In [133]:
data = pd.concat([ex2, ex2_dummies], axis = 1)
In [134]:
data.head()
Out[134]:
In [135]:
X = data.loc[:, data.columns != 'left']
In [136]:
X.head()
Out[136]:
In [137]:
len(X.columns)
Out[137]:
In [138]:
Y = data['left']
In [139]:
from sklearn.model_selection import train_test_split
In [140]:
X = X.values
Y = Y.values
In [141]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)
In [142]:
model = Sequential()
model.add(Dense(1, input_dim = X_train.shape[1], activation = 'sigmoid'))
model.compile(optimizer = Adam(lr = 0.05), loss = 'binary_crossentropy', metrics = ['accuracy'])
In [143]:
model.summary()
In [144]:
model.fit(X_train, Y_train, epochs = 20)
Out[144]:
In [145]:
Y_test_predicted = model.predict_classes(X_test)
In [146]:
from sklearn.metrics import confusion_matrix, classification_report
In [147]:
def pretty_confusion_matrix(y_true, y_pred, labels=["False", "True"]):
    cm = confusion_matrix(y_true, y_pred)
    pred_labels = ['Predicted ' + l for l in labels]
    df = pd.DataFrame(cm, index = labels, columns = pred_labels)
    return df
In [148]:
pretty_confusion_matrix(Y_test, Y_test_predicted, labels=['Stay', 'Leave'])
Out[148]:
In [149]:
print(classification_report(Y_test, Y_test_predicted))
In [150]:
from sklearn.model_selection import cross_val_score, KFold
In [151]:
from keras.wrappers.scikit_learn import KerasClassifier
In [152]:
def build_model():
    model = Sequential()
    model.add(Dense(1,
                    input_dim = 20,
                    activation = 'sigmoid'))
    model.compile(Adam(lr = 0.1), 'binary_crossentropy', metrics = ['accuracy'])
    return model
In [153]:
model = KerasClassifier(build_fn = build_model,
                        epochs = 10,
                        verbose = 0)
In [154]:
from sklearn.model_selection import KFold, cross_val_score
In [155]:
cross_val = KFold(5, shuffle = True)
In [156]:
scores = cross_val_score(model, X, Y, cv = cross_val)
In [157]:
print("The cross validation accuracy is {:0.4f} ± {:0.4f}".format(scores.mean(), scores.std()))
In [158]:
scores
Out[158]:
The linear model is not sufficient!