In [1]:
%matplotlib inline
In [2]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import scipy.stats as stats
matplotlib.style.use('ggplot')
In [3]:
In [4]:
import math
def scrub(filePath):
    """Load the Titanic CSV at ``filePath`` and clean it for modelling.

    Cleaning steps:
      * Cabin is reduced to its deck letter (first character) as a categorical;
        NaN becomes the string 'n' (first char of 'nan'), acting as an
        "unknown deck" category.
      * Missing fares are filled with the mean fare.
      * Missing ages are filled with the median age of the passenger's class.
      * Missing embarkation ports default to 'S' (the most common port).
      * A Title column is derived from Name via nameToCategory.

    Returns the cleaned DataFrame.
    """
    data = pd.read_csv(filePath)

    # Keep only the deck letter of the cabin.
    char_cabin = data['Cabin'].astype(str)
    data['Cabin'] = pd.Categorical([cabin[0] for cabin in char_cabin])

    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

    # Fill missing ages with the per-class median. This replaces the original
    # row-wise apply (slow, and it coerced column dtypes) with a vectorized
    # groupby transform. The original raised 'Goofed' when an age-less row had
    # a class outside 1-3; this check is slightly stricter (any bad class).
    if not set(data['Pclass'].dropna().unique()) <= {1, 2, 3}:
        raise Exception('Goofed')
    class_median_age = data.groupby('Pclass')['Age'].transform('median')
    data['Age'] = data['Age'].fillna(class_median_age)

    # Most passengers embarked at Southampton, so default missing ports to 'S'.
    data['Embarked'] = data['Embarked'].fillna('S')

    # NOTE(review): nameToCategory is not defined anywhere in this notebook —
    # presumably it lived in a since-deleted cell. Restart & Run All will fail
    # here until it is restored.
    data['Title'] = data['Name'].apply(nameToCategory)
    return data
The `scrub` function above munges the raw Titanic CSV into a cleaned, model-ready shape: cabin deck letters, mean-filled fares, per-class median ages, defaulted embarkation ports, and a derived title column.
In [5]:
from sklearn import linear_model
from sklearn import preprocessing

# Shared encoder used by trainFeaturesFor below. (Bug fix: the original line
# assigned the name twice — `label_encoder = label_encoder = ...` — a typo.)
label_encoder = preprocessing.LabelEncoder()
def trainFeaturesFor(df):
    # Build the numeric feature matrix for the logistic-regression model:
    # each categorical column is label-encoded to integers and Age is passed
    # through unchanged. Returns a DataFrame whose columns are, in order:
    # class, cabin, sex, title, parch, age (integer-named 0..5 after the .T).
    #
    # NOTE(review): the single shared label_encoder is re-fit on every column
    # and on every DataFrame passed in, so the integer codes assigned for the
    # train and test sets only line up if both contain the same category
    # values. Fitting on train and merely transforming test would be safer —
    # TODO confirm and refactor.
    encoded_sex = label_encoder.fit_transform(df["Sex"])
    encoded_class = label_encoder.fit_transform(df["Pclass"])
    encoded_cabin = label_encoder.fit_transform(df["Cabin"])
    encoded_title = label_encoder.fit_transform(df["Title"])
    encoded_parch = label_encoder.fit_transform(df["Parch"])
    train_features = pd.DataFrame([ encoded_class
                                  , encoded_cabin
                                  , encoded_sex
                                  , encoded_title
                                  , encoded_parch
                                  , df["Age"]
                                  ]).T
    return train_features
def trainModel(df):
    """Fit a logistic-regression classifier on the given training frame.

    Features come from trainFeaturesFor(df); the target is the Survived
    column. Returns the fitted model.
    """
    features = trainFeaturesFor(df)
    model = linear_model.LogisticRegression()
    model.fit(X=features, y=df["Survived"])
    return model
In [6]:
from sklearn.model_selection import train_test_split

# Clean the full training file, then hold out 20% for evaluation.
completeDf = scrub('train.csv')
train, test = train_test_split(completeDf, test_size=0.2, random_state=1)

# Display the split sizes.
[len(train), len(test)]
Out[6]:
In [7]:
# Fit the logistic model and inspect its accuracy on the training split.
log_model = trainModel(train)
train_features = trainFeaturesFor(train)
preds = log_model.predict(X=train_features)
log_train_score = log_model.score(X=train_features, y=train['Survived'])
print(log_train_score)

# Confusion matrix: predicted vs. actual survival on the training split.
pd.crosstab(preds, train["Survived"])
Out[7]:
In [8]:
# Evaluate the fitted logistic model on the held-out split.
test_features = trainFeaturesFor(test)
test_preds = log_model.predict(X=test_features)
log_score = log_model.score(X=test_features, y=test['Survived'])
print(log_score)

# Confusion matrix: predicted vs. actual survival on the held-out split.
pd.crosstab(test_preds, test["Survived"])
Out[8]:
In [9]:
def nn_scrub(df):
    """Prepare a scrubbed frame for the neural net.

    Drops identifier/unused columns (Name, Ticket, Fare, PassengerId), then
    one-hot encodes the remaining categorical columns. Returns the new frame.
    """
    trimmed = df.drop(['Name', 'Ticket', 'Fare', 'PassengerId'], axis=1)
    return pd.get_dummies(trimmed)
In [10]:
# Build the neural-net dataset: scrub the raw CSV, then one-hot encode it.
nn_df = nn_scrub(scrub('train.csv'))
nn_df.tail()
Out[10]:
In [11]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the one-hot-encoded data for the neural net.
nn_train, nn_test = train_test_split(nn_df, test_size=0.2, random_state=3)

# Display the split sizes. (Bug fix: this cell previously displayed
# [len(train), len(test)] — the sizes of the *logistic* model's split —
# instead of the neural-net split it just created.)
[len(nn_train), len(nn_test)]
Out[11]:
In [12]:
from sklearn.neural_network import MLPClassifier

# Features are every column except the target.
X = nn_train.drop('Survived', axis=1)
y = nn_train.Survived

# Small two-hidden-layer MLP; lbfgs converges well on small datasets.
clf = MLPClassifier(solver='lbfgs',
                    activation='logistic',
                    alpha=1e-3,
                    hidden_layer_sizes=(25, 25),
                    random_state=1)
clf.fit(X, y)
Out[12]:
In [13]:
sizes = [5, 10, 15, 20, 25]
bestScore = 0

def classifierFor(alpha, hiddenLayer, X, y):
    """Fit an MLP with the given alpha and hidden-layer sizes on (X, y)."""
    tempClf = MLPClassifier(solver='lbfgs',
                            activation='logistic',
                            alpha=alpha,
                            hidden_layer_sizes=hiddenLayer,
                            random_state=1)
    tempClf.fit(X, y)
    return tempClf

# Grid search over regularization strength and three-layer topologies.
# Each configuration is scored as the mean test accuracy over `runs` random
# train/test splits, to smooth out split-to-split noise.
for alpha in [3e-5, 1e-4, 3e-4, 1e-3]:
    maxForAlpha = 0
    for j in sizes:
        for k in sizes:
            for l in sizes:
                hiddenLayer = (j, k, l)
                runs = 5
                total = 0
                for i in range(runs):
                    nn_train, nn_test = train_test_split(nn_df, test_size=0.2)
                    X = nn_train.drop('Survived', axis=1)
                    y = nn_train.Survived
                    tempClf = classifierFor(alpha, hiddenLayer, X, y)
                    nn_score = tempClf.score(X=nn_test.drop('Survived', axis=1),
                                             y=nn_test.Survived)
                    total = total + nn_score
                averageScore = total / runs
                if averageScore > bestScore:
                    bestScore = averageScore
                    print('averageScore=' + str(averageScore) + ' alpha=' + str(alpha) + ' hiddenLayers=' + str(hiddenLayer))
                    clf = tempClf
                # Bug fix: the per-alpha maximum previously compared the score
                # of only the *last* split (nn_score) rather than the mean over
                # all runs, so the reported best topology per alpha was driven
                # by a single noisy split.
                if averageScore > maxForAlpha:
                    maxForAlpha = averageScore
                    maxForAlphaHidden = hiddenLayer
    print('Max for alpha of ' + str(alpha) + '::: nn_score= ' + str(maxForAlpha) + ' hiddenLayers=' + str(maxForAlphaHidden))
# NOTE(review): `clf` keeps the model from the best-average configuration, but
# it was trained on only one of that configuration's five random splits —
# retraining on the full training data with the winning hyperparameters would
# be cleaner and less prone to the slight overfit noted here originally.
In [14]:
# Score the selected network on the most recent train/test split.
nn_test_features = nn_test.drop('Survived', axis=1)
nn_preds = clf.predict(nn_test_features)
nn_train_score = clf.score(X=nn_train.drop('Survived', axis=1),
                           y=nn_train['Survived'])
nn_score = clf.score(X=nn_test_features, y=nn_test["Survived"])
print(nn_score)

# Confusion matrix: predicted vs. actual survival for the network.
pd.crosstab(nn_preds, nn_test["Survived"])
Out[14]:
In [15]:
# Compare [train accuracy, test accuracy] for the logistic model and the
# neural net — a large gap between the two numbers indicates overfitting.
print([log_train_score, log_score])
print([nn_train_score, nn_score])
In [16]:
# Prepare the Kaggle test file with the same scrub + one-hot pipeline.
nn_titanic_test = scrub('test.csv')
# Capture the IDs before nn_scrub drops them (the original re-read and
# re-scrubbed test.csv a second time just to fetch this column).
passenger_ids = nn_titanic_test['PassengerId']
nn_titanic_test = nn_scrub(nn_titanic_test)

# The test set lacks some categories seen in training (e.g. no deck-'T'
# cabins), so its one-hot encoding has fewer columns. Reindex to the training
# feature columns — missing dummies become 0 — instead of inserting 'Cabin_T'
# at a hard-coded position. This also guarantees the exact column order the
# classifier was fit with, resolving the inline TODO.
nn_titanic_test = nn_titanic_test.reindex(columns=X.columns, fill_value=0)

submit_preds = clf.predict(X=nn_titanic_test)
submission = pd.DataFrame({"PassengerId": passenger_ids,
                           "Survived": submit_preds})
submission.to_csv("submission.csv", index=False)
In [17]:
# Exploratory: class probabilities from both models. Not used downstream yet
# (e.g. they could feed a calibration curve or a soft-voting ensemble).
nnproba_s = clf.predict_proba(nn_test.drop('Survived', axis=1))
log_model.predict_proba(trainFeaturesFor(test))
print('I think I need to do something with these')
In [ ]: