Predicting survival on the Titanic using an artificial neural network in Keras
Supervised learning: binary classification
This project is based on a dataset containing demographics and passenger information for 891 of the 2,224 passengers and crew on board the Titanic. A description of the dataset is available on the Kaggle website, where the data was obtained.
In [1]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import helper
import keras
helper.info_gpu()
helper.reproducible(seed=0)  # set up reproducible results from run to run (Keras)
%matplotlib inline
In [2]:
data_path = 'data/titanic_data.csv'
target = ['Survived'] # the target will remain the same throughout the notebook
df_original = pd.read_csv(data_path)
print("{} rows \n{} columns \ntarget: {}".format(*df_original.shape, target))
In [3]:
df_original.head(3)
Out[3]:
In [4]:
df_original.describe(percentiles=[0.5])
Out[4]:
In [5]:
df_original.describe(include=['O'])
Out[5]:
In [6]:
helper.missing(df_original)
In [7]:
df = df_original.copy() # modified dataset
def enhance_features(df, dict_categories=None):
    """ Enhance dataframe df with derived features """
    df = df.copy()

    # filter Cabin to its first letter (deck)
    df["Cabin"] = df["Cabin"].str[0]

    # extract Title from Name
    df['Title'] = df["Name"].str.extract(r'([A-Za-z]+)\.', expand=False)

    # remove low-frequency values of the new features
    fields = ['Cabin', 'Title']
    df, dict_categories = helper.remove_categories(df, target=target, show=False)

    # flag passengers travelling alone
    df['Alone'] = ((df["SibSp"] + df["Parch"]) == 0).astype(int)

    return df, dict_categories
df, dict_categories = enhance_features(df)
In [8]:
def drop_irrelevant_features(df, inplace=False):
    """ Remove non-relevant columns from the dataframe df """
    if not inplace:
        df = df.copy()

    df.drop(['PassengerId', 'Name', 'Ticket'], axis='columns', inplace=True)

    if not inplace:
        return df
drop_irrelevant_features(df, inplace=True)
In [9]:
df = helper.classify_data(df, target, numerical=["Age", "SibSp", "Parch", "Fare"])
pd.DataFrame(dict(df.dtypes), index=["Type"])[df.columns].head() # show data types
Out[9]:
In [10]:
helper.show_categorical(df, target=target, sharey=True)
In [11]:
helper.show_target_vs_categorical(df, target)
plt.ylim([0, 1]);
In [12]:
helper.show_numerical(df, kde=True)
In [13]:
helper.show_target_vs_numerical(df, target, jitter=0.2)
plt.ylim([-0.4, 1.4])
plt.yticks([0, 1]);
#df.groupby('Survived')['Age'].hist(alpha=0.4)
# helper.show_target_vs_numerical(df_3sigma, target, numerical, jitter=0.2)
In [14]:
helper.show_correlation(df, target)
In [15]:
sns.FacetGrid(
    df, row="Sex", col="Pclass", hue="Survived", size=3, margin_titles=True).map(
        plt.hist, "Age", alpha=.7).add_legend()
plt.ylim([0, 70]);
# df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean().sort_values(
# by='Survived', ascending=False)
# helper.show_target_vs_categorical(df.loc[(df['Age']<12) | (df['Sex']=='female')],
# target, categorical)
In [16]:
helper.missing(df)
In [17]:
plt.figure(figsize=(7, 3))
sns.countplot(data=df, x='Pclass', hue='Cabin');
In [18]:
helper.show_target_vs_categorical(df, ['Age'], figsize=(17, 2)) # Age vs categorical
In [19]:
def fill_missing_values(df, inplace=False):
    """ Fill missing values of the dataframe df """
    if not inplace:
        df = df.copy()

    # fill Embarked with the mode
    df['Embarked'].fillna(df['Embarked'].mode()[0], inplace=True)

    # fill Cabin: the mode for grouped Pclass and Embarked
    ref = df.groupby(['Pclass', 'Embarked'])['Cabin'].transform(lambda x: x.mode()[0])
    df['Cabin'].fillna(ref, inplace=True)

    # fill Age: the median for grouped Pclass and Title
    ref = df.groupby(['Pclass', 'Title'])['Age'].transform('median')
    df['Age'].fillna(ref, inplace=True)

    # fill Title: by age and sex only (not spouse or job)
    # df.loc[df['Title']=='Master','Age'].unique()
    # for idx, row in df.iterrows():
    #     if pd.isnull(row['Title']):
    #         if row['Age'] >= 13:
    #             if row['Sex'] == 'male':
    #                 df.loc[idx, 'Title'] = "Mr"
    #             else:
    #                 df.loc[idx, 'Title'] = "Mrs"
    #         else:
    #             if row['Sex'] == 'male':
    #                 df.loc[idx, 'Title'] = "Master"
    #             else:
    #                 df.loc[idx, 'Title'] = "Miss"

    # fill missing categorical values with the mode (if any)
    categorical = list(df.select_dtypes(include=['category']))
    modes = df[categorical].mode()  # this avoids a fillna issue with mode()
    for idx, f in enumerate(df[categorical]):
        df[f].fillna(modes.iloc[0, idx], inplace=True)

    # fill missing numeric values with the median (if any)
    df.fillna(df.median(), inplace=True)

    if not inplace:
        return df
# bins = list(range(0, 80, 10))
# # bins = (0, 5, 10, 15, 20, 30, 40, 50, 60)
# labels = ["{}-{}".format(i, j) for i, j in zip(bins[:-1], bins[1:])]
# df['Age_cat'] = pd.cut(df['Age'], bins, labels=labels).astype('category')
# df = df.drop(['Age'], axis='columns')
fill_missing_values(df, inplace=True)
In [20]:
droplist = []  # features to drop from the model

# for the model, use 'data' instead of 'df'
data = df.copy()
data.drop(droplist, axis='columns', inplace=True)
data.head(3)
Out[20]:
In [21]:
data, scale_param = helper.scale(data)
Replace categorical features (excluding the target) with dummy features.
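Conceptually, this step is standard one-hot encoding. A minimal sketch of the idea, assuming helper.replace_by_dummies behaves like pandas' get_dummies on the non-target categorical columns (not the helper's actual implementation):

toy = pd.DataFrame({'Survived': [0, 1, 1],
                    'Sex': ['male', 'female', 'female'],
                    'Embarked': ['S', 'C', 'S']})

# one-hot encode every categorical column except the target
dummies = pd.get_dummies(toy, columns=['Sex', 'Embarked'])
print(list(dummies))  # ['Survived', 'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_S']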
In [22]:
data, dict_dummies = helper.replace_by_dummies(data, target)
model_features = [f for f in data if f not in target] # sorted neural network inputs
data.head(3)
Out[22]:
In [23]:
from sklearn.model_selection import train_test_split
def split(data, target, test_size=0.15):

    train, test = train_test_split(data, test_size=test_size, random_state=9,
                                   stratify=data[target])

    # separate the data into features and target (x=features, y=target)
    x_train, y_train = train.drop(target, axis=1).values, train[target].values
    x_test, y_test = test.drop(target, axis=1).values, test[target].values

    return x_train, y_train, x_test, y_test
x_train, y_train, x_test, y_test = split(data, target, test_size=0.2)
In [24]:
def one_hot_output(y_train, y_test):
    num_classes = len(np.unique(y_train))
    y_train = keras.utils.to_categorical(y_train, num_classes)
    y_test = keras.utils.to_categorical(y_test, num_classes)
    return y_train, y_test
y_train, y_test = one_hot_output(y_train, y_test)
print("train size \t X:{} \t Y:{}".format(x_train.shape, y_train.shape))
print("test size \t X:{} \t Y:{} ".format(x_test.shape, y_test.shape))
In [25]:
from keras.models import Sequential
from keras.layers.core import Dense, Dropout
def build_nn(input_size, output_size, summary=False):

    input_nodes = input_size
    weights = keras.initializers.RandomNormal(stddev=0.001)
    leaky_relu = keras.layers.advanced_activations.LeakyReLU(alpha=0.01)

    model = Sequential()

    # hidden layer: linear Dense followed by a LeakyReLU activation
    model.add(
        Dense(
            input_nodes,
            input_dim=input_size,
            kernel_initializer=weights,
            bias_initializer='zero'))
    model.add(leaky_relu)
    model.add(Dropout(.3))

    # output layer
    model.add(
        Dense(
            output_size,
            activation='softmax',
            kernel_initializer=weights,
            bias_initializer='zero'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    if summary:
        model.summary()

    return model
model = build_nn(x_train.shape[1], y_train.shape[1], summary=True)
In [26]:
from time import time
model_path = os.path.join("models", "titanic.h5")
def train_nn(model, x_train, y_train, validation_data=None, path=False, show=True):
    """
    Train the neural network model. If no validation_data is provided,
    a split for validation will be used.
    """
    if show:
        print('Training ....')

    callbacks = [keras.callbacks.EarlyStopping(monitor='val_loss', patience=1, verbose=0)]

    t0 = time()
    history = model.fit(
        x_train,
        y_train,
        epochs=1000,
        batch_size=64,
        verbose=0,
        validation_split=0.25,
        validation_data=validation_data,
        callbacks=callbacks)

    if show:
        print("time: \t {:.1f} s".format(time() - t0))
        helper.show_training(history)

    if path:
        model.save(path)
        print("\nModel saved at", path)

    return history
model = None
model = build_nn(x_train.shape[1], y_train.shape[1], summary=False)
train_nn(model, x_train, y_train, path=model_path);
In [27]:
from sklearn.model_selection import StratifiedKFold
def cv_train_nn(x_train, y_train, n_splits):
    """ Create and train models for cross validation. Return best model """
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True)

    score = []
    best_model = None
    best_acc = 0

    print('Training {} models for Cross Validation ...'.format(n_splits))

    for train, val in skf.split(x_train[:, 0], y_train[:, 0]):

        model = None
        model = build_nn(x_train.shape[1], y_train.shape[1], summary=False)
        history = train_nn(
            model,
            x_train[train],
            y_train[train],
            show=False,
            validation_data=(x_train[val], y_train[val]))

        val_acc = history.history['val_acc'][-1]
        score.append(val_acc)

        if val_acc > best_acc:  # save best model (fold) for evaluation and predictions
            best_model = model
            best_acc = val_acc

    model = best_model

    print('\nCross Validation accuracy: {:.3f}'.format(np.mean(score)))

    return best_model
model = cv_train_nn(x_train, y_train, 4)
In [28]:
def evaluate_nn(model, x_test, y_test):
    score = model.evaluate(x_test, y_test, verbose=0)
    print("Test Accuracy: {:.3f}".format(score[1]))
#model = keras.models.load_model(model_path)
evaluate_nn(model, x_test, y_test)
In [29]:
y_pred = model.predict(x_test, verbose=2)
helper.binary_classification_scores(
y_test[:, 1], y_pred[:, 1], return_dataframe=True, index="Neural Network")
Out[29]:
In [30]:
def predict_manual(new_df):
    """ Predict survival probabilities for a custom dataframe of passengers """
    new_data = new_df.copy()

    # force data types to match the previous dataframe df
    for col in new_data:
        new_data[col] = new_data[col].astype(df.dtypes[col])

    # standardize numerical variables
    new_data, _ = helper.scale(new_data, scale_param)

    # replace categorical features with dummy variables (using the existing dummies)
    new_data, _ = helper.replace_by_dummies(new_data, target, dict_dummies)

    # sort columns to match the model inputs
    new_data = new_data[model_features]  # model_features: sorted list used in the model

    # make predictions
    prediction = model.predict(new_data.values)[:, 1]

    return prediction
# for index, row in new_data.iterrows():
# single_pred = model.predict(np.array([row]))
# print('{}:\t {:.0f}%'.format(index,single_pred[0,1] * 100))
In [31]:
# input data format
df.describe()
Out[31]:
In [32]:
df.describe(include=['category'])
Out[32]:
In [33]:
print(list(df))
In [34]:
new_passengers = {
    'Average man':    [26, 1, 0, 14, 2, 'male',   'C', 'S', 'Mr',     0],
    'Average woman':  [26, 1, 0, 14, 2, 'female', 'C', 'S', 'Mrs',    0],
    'Alone woman 3c': [26, 0, 2,  8, 3, 'female', 'C', 'S', 'Miss',   1],
    'Boy 1c ':        [ 7, 0, 2, 31, 1, 'male',   'C', 'S', 'Master', 0],
    'Boy 2c ':        [ 7, 0, 2, 14, 2, 'male',   'C', 'S', 'Master', 0],
    'Boy 3c ':        [ 7, 0, 2,  8, 3, 'male',   'C', 'S', 'Master', 0],
}

# create a dataframe with the new data
new_df = pd.DataFrame(
    data=list(new_passengers.values()),
    index=new_passengers.keys(),
    columns=[f for f in list(df) if f not in target])
prediction = predict_manual(new_df)
new_df['Survival prob. (%)'] = (prediction * 100).astype(int)
new_df
Out[34]:
The model's predictions confirm the impact of sex on the survival probability, as well as the impact of passenger class on the survival of women and children.
In [35]:
# Same dataset without:
#  - enhancing features
#  - adding new features
#  - filling missing values using grouped medians

def non_enhanced_pipeline(df):
    df = df.copy()

    # select & classify features
    df.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis='columns', inplace=True)
    df = helper.classify_data(df, target, numerical=["Age", "SibSp", "Parch", "Fare"])

    # fill NaN
    df.fillna(df.median(), inplace=True)

    # standardize and create dummies
    data, _ = helper.scale(df)
    data, _ = helper.replace_by_dummies(data, target)

    # split and one-hot encode the output
    x_train, y_train, x_test, y_test = split(data, target, test_size=0.15)
    y_train, y_test = one_hot_output(y_train, y_test)

    # build, train and evaluate the model
    model = build_nn(x_train.shape[1], y_train.shape[1], summary=False)
    train_nn(model, x_train, y_train, path=False, show=False)
    evaluate_nn(model, x_test, y_test)
non_enhanced_pipeline(df_original)
In [36]:
def remove_outliers_pipeline(df):
    df = df.copy()

    # transform features
    df, dict_categories = enhance_features(df)

    # select & classify features
    df.drop(['PassengerId', 'Name', 'Ticket'], axis='columns', inplace=True)
    df = helper.classify_data(df, target, numerical=["Age", "SibSp", "Parch", "Fare"])

    # remove outliers (default: values above 3 standard deviations)
    helper.remove_outliers(df, inplace=True)

    # fill missing values (enhanced)
    fill_missing_values(df, inplace=True)

    # standardize and create dummies
    data, _ = helper.scale(df)
    data, _ = helper.replace_by_dummies(data, target)

    # split and one-hot encode the output
    x_train, y_train, x_test, y_test = split(data, target, test_size=0.15)
    y_train, y_test = one_hot_output(y_train, y_test)

    # build, train and evaluate the model
    model = build_nn(x_train.shape[1], y_train.shape[1], summary=False)
    train_nn(model, x_train, y_train, path=False, show=False)
    evaluate_nn(model, x_test, y_test)


remove_outliers_pipeline(df_original)
In [37]:
import warnings
warnings.filterwarnings("ignore")
helper.XGBClassifier(
x_train, y_train[:,1], x_test, y_test[:,1], max_depth=4, n_estimators=400, learning_rate=0.1)
Out[37]:
In [38]:
# enhanced features
helper.ml_classification(x_train, y_train[:,1], x_test, y_test[:,1])
Out[38]:
In [39]:
from sklearn.ensemble import RandomForestClassifier
clf_random_forest = RandomForestClassifier(
    n_estimators=30, max_depth=13, class_weight='balanced',
    n_jobs=-1, random_state=0).fit(x_train, np.ravel(y_train[:, 1]))
In [40]:
y_pred = clf_random_forest.predict(x_test).reshape([-1, 1])
helper.binary_classification_scores(
y_test[:, 1], y_pred, return_dataframe=True, index="Random Forest")
Out[40]:
In [41]:
re = helper.feature_importances(model_features, clf_random_forest)