# Titanic: this time with cross-validation

## And also a Neural Net

``````

In [1]:

%matplotlib inline

``````
``````

In [2]:

import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats as stats
matplotlib.style.use('ggplot')

``````

# NameToCategory

Here, I create a function that's used for creating a "Title" feature by pulling out `Mr`, `Miss`, etc.

For titles that only a few passengers fell into, I made a catch-all `Other` group.

``````

In [3]:

``````

# Scrub

This function reads in a CSV and cleans the data, filling in missing values

``````

In [4]:

import math

def scrub(filePath):
    """Read the Titanic CSV at `filePath` and fill in missing data.

    Cleaning steps:
      - Cabin: reduced to its first character (str(NaN) == 'nan', so missing
        cabins become 'n') and stored as a categorical.
      - Fare: missing values filled with the column mean.
      - Age: missing values filled with the median age of the row's Pclass.
      - Embarked: missing values filled with 'S'.
      - Title: derived from Name via nameToCategory (defined earlier).

    Returns the cleaned DataFrame.
    """
    # Bug fix: the original referenced `data` without ever loading it from
    # filePath, which raises NameError on a fresh kernel run.
    data = pd.read_csv(filePath)

    char_cabin = data['Cabin'].astype(str)
    new_cabin = np.array([cabin[0] for cabin in char_cabin])
    data['Cabin'] = pd.Categorical(new_cabin)

    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

    # Median age per passenger class, used to impute missing ages below.
    c1Median = data.Age[data.Pclass == 1].median()
    c2Median = data.Age[data.Pclass == 2].median()
    c3Median = data.Age[data.Pclass == 3].median()

    def medianFor(row):
        # Map a row to its class's median age; any Pclass outside 1-3 is a
        # data error, so fail loudly.
        if row['Pclass'] == 1:
            return c1Median
        elif row['Pclass'] == 2:
            return c2Median
        elif row['Pclass'] == 3:
            return c3Median
        else:
            raise Exception('Goofed')

    def updateAge(row):
        # Replace a missing Age with the class median; other rows pass through.
        if math.isnan(row['Age']):
            median = medianFor(row)
            row['Age'] = median
        return row

    # Update the missing ages with the median
    data = data.apply(updateAge, axis=1)

    # Fill the few missing embarkation ports with 'S'.
    new_embarked = np.where(data['Embarked'].isnull()
                            , 'S'
                            , data['Embarked'])

    data['Embarked'] = new_embarked

    data['Title'] = data['Name'].apply(nameToCategory)

    return data

``````

Here, I make a few functions that are used to munge the data into a certain shape.

``````

In [5]:

from sklearn import linear_model
from sklearn import preprocessing

# Shared encoder; it is re-fit per column in trainFeaturesFor, which is fine
# because only the transformed values are kept, never the fitted state.
# (Fixed: the original had a redundant doubled assignment.)
label_encoder = preprocessing.LabelEncoder()

def trainFeaturesFor(df):
    """Label-encode the categorical columns of `df` and assemble the
    logistic model's feature matrix.

    Column order: Pclass, Cabin, Sex, Title, Parch, Age.
    Note: `label_encoder` is re-fit on every column, so codes are only
    consistent within a single call.
    """
    sex_codes = label_encoder.fit_transform(df["Sex"])
    class_codes = label_encoder.fit_transform(df["Pclass"])
    cabin_codes = label_encoder.fit_transform(df["Cabin"])
    title_codes = label_encoder.fit_transform(df["Title"])
    parch_codes = label_encoder.fit_transform(df["Parch"])

    # Stack the encoded columns as rows, then transpose into (n_samples, 6).
    return pd.DataFrame([ class_codes
                        , cabin_codes
                        , sex_codes
                        , title_codes
                        , parch_codes
                        , df["Age"]
                        ]).T

def trainModel(df):
    """Fit and return a logistic-regression model; `df` must contain the
    'Survived' label column plus the columns trainFeaturesFor expects."""
    model = linear_model.LogisticRegression()
    model.fit(X=trainFeaturesFor(df), y=df["Survived"])
    return model

``````

## Splitting the data into train and test

``````

In [6]:

from sklearn.model_selection import train_test_split

# Load and clean the full training set, then hold out 20% for evaluation.
completeDf = scrub('train.csv')
train, test = train_test_split(completeDf, test_size=0.2, random_state=1)

# Sanity check: sizes of the two splits.
[len(train), len(test)]

``````
``````

Out[6]:

[712, 179]

``````

## Training and scoring the model

``````

In [7]:

# Fit the logistic model and check its in-sample accuracy.
log_model = trainModel(train)

train_features = trainFeaturesFor(train)
preds = log_model.predict(X=train_features)
log_train_score = log_model.score(X=train_features, y=train['Survived'])

print(log_train_score)
# Confusion matrix: rows are predictions, columns are the true labels.
pd.crosstab(preds, train["Survived"])

``````
``````

0.800561797753

Out[7]:

Survived
0
1

row_0

0
384
83

1
59
186

``````

## Scoring the model on the test data

``````

In [8]:

# Evaluate the logistic model on the held-out split.
test_features = trainFeaturesFor(test)
test_preds = log_model.predict(X=test_features)
log_score = log_model.score(X=test_features, y=test['Survived'])
print(log_score)
pd.crosstab(test_preds, test["Survived"])

``````
``````

0.787709497207

Out[8]:

Survived
0
1

row_0

0
90
22

1
16
51

``````

# Neural Net

``````

In [9]:

def nn_scrub(df):
    """Prepare a scrubbed frame for the neural net: drop identifier-like
    columns (Name, Ticket, Fare, PassengerId) and one-hot encode the
    remaining categorical columns via pd.get_dummies."""
    trimmed = df.drop(['Name', 'Ticket', 'Fare', 'PassengerId'], axis=1)
    return pd.get_dummies(trimmed)

``````
``````

In [10]:

# Build the neural-net frame: same scrub pipeline, then one-hot encoding.
nn_df = nn_scrub(scrub('train.csv'))

nn_df.tail()

``````
``````

Out[10]:

Survived
Pclass
Age
SibSp
Parch
Sex_female
Sex_male
Cabin_A
Cabin_B
Cabin_C
...
Cabin_T
Cabin_n
Embarked_C
Embarked_Q
Embarked_S
Title_Master
Title_Miss
Title_Mr
Title_Mrs
Title_Other

886
0
2
27.0
0
0
0
1
0
0
0
...
0
1
0
0
1
0
0
0
0
1

887
1
1
19.0
0
0
1
0
0
1
0
...
0
0
0
0
1
0
1
0
0
0

888
0
3
24.0
1
2
1
0
0
0
0
...
0
1
0
0
1
0
1
0
0
0

889
1
1
26.0
0
0
0
1
0
0
1
...
0
0
1
0
0
0
0
1
0
0

890
0
3
32.0
0
0
0
1
0
0
0
...
0
1
0
1
0
0
0
1
0
0

5 rows × 24 columns

``````

## Splitting into train and test

``````

In [11]:

from sklearn.model_selection import train_test_split

# Split the one-hot frame for the neural net.
nn_train, nn_test = train_test_split(nn_df, test_size=0.2, random_state=3)

# Bug fix: the original displayed [len(train), len(test)] — the sizes of the
# earlier *logistic* split — instead of the split made in this cell.
[len(nn_train), len(nn_test)]

``````
``````

Out[11]:

[712, 179]

``````
``````

In [12]:

from sklearn.neural_network import MLPClassifier

# Separate features from the target column.
X = nn_train.drop('Survived', axis=1)
y = nn_train.Survived

# Two hidden layers of 25 logistic units, lbfgs solver, fixed seed.
clf = MLPClassifier(solver='lbfgs',
                    activation='logistic',
                    alpha=1e-3,
                    hidden_layer_sizes=(25, 25),
                    random_state=1)
clf.fit(X, y)

``````
``````

Out[12]:

MLPClassifier(activation='logistic', alpha=0.001, batch_size='auto',
beta_1=0.9, beta_2=0.999, early_stopping=False, epsilon=1e-08,
hidden_layer_sizes=(25, 25), learning_rate='constant',
learning_rate_init=0.001, max_iter=200, momentum=0.9,
nesterovs_momentum=True, power_t=0.5, random_state=1, shuffle=True,
solver='lbfgs', tol=0.0001, validation_fraction=0.1, verbose=False,
warm_start=False)

``````
``````

In [13]:

# Candidate per-layer widths for the grid search below.
sizes = [5, 10, 15, 20, 25]

bestScore = 0

def classifierFor(alpha, hiddenLayer, X, y):
    """Fit and return an lbfgs MLP with the given alpha and hidden-layer
    shape (seeded for repeatable weight initialization)."""
    model = MLPClassifier(solver='lbfgs',
                          activation='logistic',
                          alpha=alpha,
                          hidden_layer_sizes=hiddenLayer,
                          random_state=1)
    model.fit(X, y)
    return model

# Grid search: for each alpha, try every (j, k, l) hidden-layer shape and
# score it as the mean test accuracy over 5 fresh random 80/20 splits of nn_df.
# NOTE(review): indentation was lost in this export; loop nesting is inferred
# from the printed output below — confirm against the original notebook.
for alpha in [3e-5, 1e-4, 3e-4, 1e-3]:
maxForAlpha = 0
for j in sizes:
for k in sizes:
for l in sizes:
hiddenLayer = (j, k, l)
runs = 5
total = 0
for i in range(runs):
# NOTE(review): re-splitting here overwrites nn_train/nn_test, so later
# cells evaluate `clf` on whichever split happened to be last — the
# "slight overfit" the author flags below.
nn_train, nn_test = train_test_split(nn_df, test_size=0.2)
X = nn_train.drop('Survived', axis=1)
y = nn_train.Survived
tempClf = classifierFor(alpha, hiddenLayer, X, y)
nn_score = tempClf.score( X=nn_test.drop('Survived', axis=1), y=nn_test.Survived)
total = total + nn_score
averageScore = total / runs
# Keep the classifier (from the last of the 5 runs) whenever its average
# score beats the best seen so far across all alphas/shapes.
if (averageScore > bestScore):
bestScore = averageScore
print('averageScore=' + str(averageScore) + ' alpha=' + str(alpha) + ' hiddenLayers=' + str(hiddenLayer))
clf = tempClf
# NOTE(review): this compares the *last run's* nn_score rather than
# averageScore — confirm whether the per-alpha max was meant to use the mean.
if (nn_score > maxForAlpha):
maxForAlpha = nn_score
maxForAlphaHidden = hiddenLayer
print('Max for alpha of ' + str(alpha) + ':::  nn_score= ' + str(maxForAlpha) + ' hiddenLayers=' + str(maxForAlphaHidden))

# This causes a slight? overfit of the model. Is there a way I can do
# something like this to get a good hiddenlayer makeup that's better
# than me just randomly guessing?

``````
``````

averageScore=0.821229050279 alpha=3e-05 hiddenLayers=(5, 5, 5)
averageScore=0.840223463687 alpha=3e-05 hiddenLayers=(5, 5, 10)
averageScore=0.845810055866 alpha=3e-05 hiddenLayers=(5, 10, 20)
averageScore=0.846927374302 alpha=3e-05 hiddenLayers=(5, 10, 25)
averageScore=0.853631284916 alpha=3e-05 hiddenLayers=(10, 25, 15)
Max for alpha of 3e-05:::  nn_score= 0.882681564246 hiddenLayers=(5, 10, 25)
averageScore=0.856983240223 alpha=0.0001 hiddenLayers=(10, 20, 20)
Max for alpha of 0.0001:::  nn_score= 0.882681564246 hiddenLayers=(15, 5, 15)
Max for alpha of 0.0003:::  nn_score= 0.882681564246 hiddenLayers=(5, 20, 5)
Max for alpha of 0.001:::  nn_score= 0.888268156425 hiddenLayers=(5, 10, 15)

``````
``````

In [14]:

# Score the chosen net on both splits and show its confusion matrix.
nn_test_features = nn_test.drop('Survived', axis=1)
nn_preds = clf.predict(nn_test_features)

nn_train_score = clf.score(X=nn_train.drop('Survived', axis=1),
                           y=nn_train['Survived'])

nn_score = clf.score(X=nn_test_features, y=nn_test["Survived"])

print(nn_score)

# Rows are predictions, columns are the true labels.
pd.crosstab(nn_preds, nn_test["Survived"])

``````
``````

0.877094972067

Out[14]:

Survived
0
1

row_0

0
111
16

1
6
46

``````
``````

In [15]:

# Side-by-side comparison: [train accuracy, test accuracy] for each model
# (logistic regression first, then the neural net).
print([log_train_score, log_score])
print([nn_train_score, nn_score])

``````
``````

[0.800561797752809, 0.78770949720670391]
[0.8455056179775281, 0.87709497206703912]

``````

## Setting up data for submit.

``````

In [16]:

# Prepare the Kaggle test set with the same pipeline used for training.
# Fix: the original called scrub('test.csv') a second time just to fetch
# PassengerId, re-reading and re-cleaning the file; keep the first result.
titanic_test_scrubbed = scrub('test.csv')
nn_titanic_test = nn_scrub(titanic_test_scrubbed)

## Adding a label that doesn't exist in any of the test data. TODO: ask Jeremy if there is a way to do this better?
nn_titanic_test.insert(14, 'Cabin_T', 0)

submit_preds = clf.predict(X=nn_titanic_test)

submission = pd.DataFrame({"PassengerId": titanic_test_scrubbed["PassengerId"],
                           "Survived": submit_preds})

submission.to_csv("submission.csv", index=False)

``````

## TODO

1. Set up a voting classifier that uses the logistic nn model.
2. Is there a way to do `pd.get_dummies` where you can pass in all of the valid values?
``````

In [17]:

# Class-probability outputs from both models; currently unused — candidates
# for the voting-classifier idea in the TODO list above.
nnproba_s = clf.predict_proba(nn_test.drop('Survived', axis=1))
log_model.predict_proba(trainFeaturesFor(test))
print('I think I need to do something with these')

``````
``````

I think I need to do something with these

``````
``````

In [ ]:

``````