In [3]:
# Core Python
from itertools import product
import re
# Data Structures
import pandas as pd
import numpy as np
# Data Visualization
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
# Prediction
import tensorflow as tf
import edward as ed
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import LabelBinarizer, StandardScaler
from keras.models import Sequential
from keras.layers.core import Dense, Activation, Dropout
from keras.optimizers import Adam
from keras.utils import np_utils
from keras.wrappers.scikit_learn import KerasClassifier
from edward.models import Normal
np.random.seed(606)
The following functions make up the data-cleaning pipeline. The preproc function at the end wraps the rest, so a single function call returns the fully processed data set.
In [4]:
# Import the data and split it into training and test sets
def split_and_clean():
    X, y = select_features(pd.read_csv('train.csv'))
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=606, stratify=y)
    return X_train, y_train, X_test, y_test
# Select the features of interest.
def select_features(data):
    target = ['Survived']
    features = ['Pclass', 'Name', 'Sex', 'Age', 'SibSp',
                'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked']
    dropped_features = ['Cabin', 'Ticket']
    X = data[features].drop(dropped_features, axis=1)
    y = data[target]
    return X, y
# Fill NAs: Fare with its mean, Embarked with 'C'.
def fix_na(data):
    na_vars = {"Fare": data.Fare.mean(), "Embarked": "C"}
    return data.fillna(na_vars)
# Process categorical variables into dummy (one-hot) columns
def create_dummies(data, cat_vars, cat_types):
    cat_data = data[cat_vars].values
    for i in range(len(cat_vars)):
        # Binarize the current leading column, drop it, and append its
        # indicator columns at the end, so column 0 is always the next raw var.
        bins = LabelBinarizer().fit_transform(cat_data[:, 0].astype(cat_types[i]))
        cat_data = np.delete(cat_data, 0, axis=1)
        cat_data = np.column_stack((cat_data, bins))
    return cat_data
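# To illustrate with a hypothetical toy input: a two-level column such as Sex
# binarizes to a single 0/1 indicator column, while the three-level Pclass
# yields three columns, e.g.
#   create_dummies(pd.DataFrame({'Sex': ['male', 'female']}), ['Sex'], ['str'])
#   -> [[1], [0]]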
# Standardize numeric variables to zero mean and unit variance
def standardize(data, real_vars):
    real_data = data[real_vars]
    scale = StandardScaler()
    return scale.fit_transform(real_data)
# Extract titles from the Name field and create one-hot-encoded title columns
def extract_titles(data):
    # Names look like 'Braund, Mr. Owen Harris': split off the surname,
    # then take everything before the first period as the title.
    names = data.Name.str.split(', ', expand=True, n=1)
    titles = names[1].str.split('.', expand=True, n=1)
    known_titles = ['Mr', 'Mrs', 'Miss', 'Master', 'Don', 'Rev', 'Dr', 'Mme', 'Ms',
                    'Major', 'Lady', 'Sir', 'Mlle', 'Col', 'Capt', 'the Countess',
                    'Jonkheer']
    for title in known_titles:
        # Exact matching avoids substring collisions (e.g. 'Mr' inside 'Mrs')
        titles[title] = (titles[0].str.strip() == title).astype('int')
    return titles.drop([0, 1], axis=1).values
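# For example, 'Braund, Mr. Owen Harris' splits into ('Braund', 'Mr. Owen Harris'),
# 'Mr' is taken from before the first period, and the loop above turns it into
# one of the 17 one-hot title columns.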
# Multilayer perceptron for predicting missing ages
def age_model(features=27, n_layers=15, n_hidden=256, dropout=0.25, optimizer=Adam()):
    model = Sequential()
    model.add(Dense(n_hidden, input_shape=(features, ), activation='relu', kernel_initializer='random_normal'))
    model.add(Dropout(dropout))
    for i in range(n_layers):
        model.add(Dense(n_hidden, activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(1))
    model.compile(loss='mean_squared_error', optimizer=optimizer, metrics=['mae'])
    return model
# Train the age model and fill in the missing Age values (the last column)
def impute_ages(data):
    missing = np.isnan(data[:, -1].astype('float'))
    known, unknown = data[~missing], data[missing]
    y = known[:, -1]
    X = known[:, :-1]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=606)
    model = age_model()
    model.fit(X_train, y_train, batch_size=64, epochs=50,
              verbose=0, validation_split=0.2)
    ages_predicted = model.predict(unknown[:, :-1])
    # Assign only the Age column; indexing with the row mask alone would
    # broadcast the predictions across entire rows.
    data[missing, -1] = ages_predicted.ravel()
    return data
# Execute the full preprocessing pipeline.
def preproc():
    # Import data & split
    X_train_, y_train, X_test_, y_test = split_and_clean()
    # Fill NAs
    X_train, X_test = fix_na(X_train_), fix_na(X_test_)
    # Preprocess categorical vars
    cat_vars = ['Pclass', 'Sex', 'Embarked']
    cat_types = ['int', 'str', 'str']
    X_train_cat = create_dummies(X_train, cat_vars, cat_types)
    X_test_cat = create_dummies(X_test, cat_vars, cat_types)
    # Preprocess numeric vars
    real_vars = ['Fare', 'SibSp', 'Parch']
    X_train_real, X_test_real = standardize(X_train, real_vars), standardize(X_test, real_vars)
    # Extract titles
    X_train_titles, X_test_titles = extract_titles(X_train), extract_titles(X_test)
    # Recombine, keeping Age as the last column so impute_ages can fill it
    X_train = np.column_stack((X_train_cat, X_train_real, X_train_titles, X_train.Age))
    X_test = np.column_stack((X_test_cat, X_test_real, X_test_titles, X_test.Age))
    # Fill missing ages
    X_train, X_test = impute_ages(X_train), impute_ages(X_test)
    return X_train, np_utils.to_categorical(y_train.values), X_test, np_utils.to_categorical(y_test.values)
Run the preproc pipeline.
In [5]:
X_train, y_train, X_test, y_test = preproc()
Now we can build the Keras classifier. The keyword arguments of create_model set default values for the architecture (number of layers, hidden units, dropout rate, and optimizer), which the grid search below will tune.
In [63]:
def create_model(features=28, n_layers=10, n_hidden=64, dropout=0.3, optimizer=Adam()):
    model = Sequential()
    model.add(Dense(n_hidden, input_shape=(features, ), activation='relu', kernel_initializer='random_normal'))
    model.add(Dropout(dropout))
    for i in range(n_layers):
        model.add(Dense(n_hidden, activation='relu'))
        model.add(Dropout(dropout))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['acc'])
    return model
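As a quick sanity check (a minimal sketch, assuming the defaults above and the X_train produced by preproc), Keras can print the resulting architecture:
create_model(features=X_train.shape[1]).summary()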
In [6]:
def fit_model(n_layers=14, n_hidden=64, dropout=0.3, epochs=200):
    model = create_model(features=X_train.shape[1], n_layers=n_layers, n_hidden=n_hidden, dropout=dropout)
    model.fit(X_train, y_train, epochs=epochs, batch_size=64)
    return model

param_grid = {
    'n_layers': [5, 10],
    'n_hidden': [25, 75],
    'dropout': [0.25, 0.35]
}
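The grid covers 2 × 2 × 2 = 8 hyperparameter combinations, and GridSearchCV refits the model on every cross-validation fold for each one. A minimal sketch (assuming Python 3.7+ dict ordering) that enumerates the combinations using the itertools.product import from the top of the notebook:
for n_layers, n_hidden, dropout in product(*param_grid.values()):
    print(n_layers, n_hidden, dropout)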
In [7]:
model = KerasClassifier(build_fn=create_model, verbose=1)
grid = GridSearchCV(estimator=model, param_grid=param_grid)
grid_result = grid.fit(X_train, y_train)
In [ ]:
best_model = grid_result.best_estimator_.model
train_score, train_accuracy = best_model.evaluate(X_train, y_train)
test_score, test_accuracy = best_model.evaluate(X_test, y_test)
print('Training Score: {0}, Training Accuracy: {1}'.format(train_score, train_accuracy))
print('Test Score: {0}, Test Accuracy: {1}'.format(test_score, test_accuracy))
In [ ]:
print(grid_result.best_params_)
# Note: recent scikit-learn versions only populate 'mean_train_score' when
# GridSearchCV is constructed with return_train_score=True.
plt.hist(grid_result.cv_results_['mean_train_score'])
In [17]:
age_data = pd.read_csv('train.csv')
# Boolean-mask on the Age column to filter rows, not the whole frame
age_unknown = age_data[age_data.Age.isnull()]
age_known = age_data[age_data.Age.notnull()]
In [ ]:
def preproc_testing():
    X = pd.read_csv('test.csv')
    # Fill NAs
    X = fix_na(X)
    # Preprocess categorical vars
    cat_vars = ['Pclass', 'Sex', 'Embarked']
    cat_types = ['int', 'str', 'str']
    X_cat = create_dummies(X, cat_vars, cat_types)
    # Preprocess numeric vars (Age is handled separately, mirroring preproc)
    real_vars = ['Fare', 'SibSp', 'Parch']
    X_real = standardize(X, real_vars)
    # Extract titles
    X_titles = extract_titles(X)
    # Recombine with Age last, then fill the missing ages as in training
    X = np.column_stack((X_cat, X_real, X_titles, X.Age))
    return impute_ages(X)
In [ ]:
testing = preproc_testing()
prediction = grid_result.predict(testing)
In [ ]:
submission = pd.DataFrame()
submission['PassengerId'] = pd.read_csv('test.csv').PassengerId
submission['Survived'] = prediction
In [ ]:
submission.to_csv('keras_titanic.csv', index=False)