In [1]:
import pandas as pd
import numpy as np
In [2]:
# Pre-compiled regexes, one per honorific that can appear in a passenger name
# (e.g. "Braund, Mr. Owen Harris"). Each pattern matches when the honorific,
# followed by a literal dot, occurs anywhere in the name.
#
# The patterns are written as raw strings so that `\.` reaches the regex engine
# as an escaped dot; in a normal string literal `\.` is an invalid escape
# sequence, which newer Python versions report as a SyntaxWarning.
import re

mrPattern = re.compile(r'.*Mr\..*')
missPattern = re.compile(r'.*Miss\..*')
masterPattern = re.compile(r'.*Master\..*')
mrsPattern = re.compile(r'.*Mrs\..*')
donPattern = re.compile(r'.*Don\..*')
revPattern = re.compile(r'.*Rev\..*')
drPattern = re.compile(r'.*Dr\..*')
mmePattern = re.compile(r'.*Mme\..*')
msPattern = re.compile(r'.*Ms\..*')
majorPattern = re.compile(r'.*Major\..*')
ladyPattern = re.compile(r'.*Lady\..*')
sirPattern = re.compile(r'.*Sir\..*')
mllePattern = re.compile(r'.*Mlle\..*')
colPattern = re.compile(r'.*Col\..*')
captPattern = re.compile(r'.*Capt\..*')
countessPattern = re.compile(r'.*Countess\..*')
jonkheerPattern = re.compile(r'.*Jonkheer\..*')
def nameToCategory(name):
    """Reduce a passenger name to one of five title categories.

    The name is tested against the module-level honorific patterns in a fixed
    order and the category of the first pattern that matches is returned.

    Parameters
    ----------
    name : str
        Full passenger name, e.g. ``'Braund, Mr. Owen Harris'``.

    Returns
    -------
    str
        One of ``'Mr'``, ``'Mrs'``, ``'Miss'``, ``'Master'`` or ``'Other'``.
        Names that match none of the patterns are also reported as ``'Other'``
        rather than raising.
    """
    # Order matters: the first matching rule wins.
    ordered_rules = (
        (mrPattern, 'Mr'),
        (jonkheerPattern, 'Other'),
        (countessPattern, 'Other'),
        (captPattern, 'Other'),
        (missPattern, 'Miss'),
        (masterPattern, 'Master'),
        (mrsPattern, 'Mrs'),
        (donPattern, 'Other'),
        (revPattern, 'Other'),
        (drPattern, 'Other'),
        (mmePattern, 'Mrs'),
        (msPattern, 'Miss'),
        (majorPattern, 'Other'),
        (ladyPattern, 'Other'),
        (sirPattern, 'Other'),
        (mllePattern, 'Miss'),
        (colPattern, 'Other'),
    )
    for title_pattern, category in ordered_rules:
        if title_pattern.match(name):
            return category
    # No recognised honorific: fall back to the catch-all bucket.
    return 'Other'
In [3]:
import math
def scrub(filePath):
    """Read a passenger CSV and return it with gaps filled and a Title column added.

    Processing steps, in order:

    * ``Cabin`` is reduced to the first character of its string form and stored
      as a categorical. Missing cabins become the string ``'nan'`` under
      ``astype(str)``, so they land in their own ``'n'`` category.
    * Missing ``Fare`` values are replaced by the mean fare of this file.
    * Missing ``Age`` values are replaced by the median age of the passenger's
      ``Pclass`` within this file. Any class value present in the data is
      handled; nothing is hard-coded to classes 1-3.
    * Missing ``Embarked`` values are replaced by ``'S'``.
    * ``Title`` is derived from ``Name`` via ``nameToCategory``.

    Parameters
    ----------
    filePath : str or path-like
        Location of the CSV file; passed straight to ``pandas.read_csv``.

    Returns
    -------
    pandas.DataFrame
        The cleaned frame, with all original columns plus ``Title``.
    """
    data = pd.read_csv(filePath)

    # First letter of the cabin string ('n' for missing values, see docstring).
    data['Cabin'] = pd.Categorical(data['Cabin'].astype(str).str[0])

    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())

    # Vectorised per-class imputation. Unlike a row-wise DataFrame.apply this
    # keeps every column's dtype intact (including the categorical Cabin).
    class_median_age = data.groupby('Pclass')['Age'].transform('median')
    data['Age'] = data['Age'].fillna(class_median_age)

    data['Embarked'] = data['Embarked'].fillna('S')

    data['Title'] = data['Name'].apply(nameToCategory)
    return data
In [4]:
def svm_scrub(df):
    """Turn a scrubbed passenger frame into a numeric feature frame.

    The ``Name``, ``Ticket``, ``Fare`` and ``PassengerId`` columns are removed
    and the remaining frame is passed through ``pandas.get_dummies``. The input
    frame is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing at least the four columns listed above.

    Returns
    -------
    pandas.DataFrame
        New frame with the unused columns dropped and dummy columns added.
    """
    unused_columns = ['Name', 'Ticket', 'Fare', 'PassengerId']
    return pd.get_dummies(df.drop(columns=unused_columns))
In [5]:
# Build the modelling frame from the training CSV, then separate the
# 'Survived' label (y) from the remaining feature columns (X).
scrubbedData = svm_scrub(scrub('train.csv'))
scrubbedData_y = scrubbedData['Survived']
scrubbedData_X = scrubbedData.drop(columns='Survived')
from sklearn.model_selection import train_test_split
In [ ]:
from sklearn.svm import SVC
In [ ]:
# Coarse grid search over the SVC hyperparameters C and gamma.
# Every (C, gamma) pair is scored by its mean accuracy over the same set of
# random 80/20 train/test splits; the best pair is then refitted on all data.
ballparksC = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
ballparksGamma = [1e-5, 3e-5, 1e-4, 3e-4, 1e-3]

# Pre-compute the splits once so that all candidates see identical data.
splits = []
for i in range(100):
    # train_test_split returns (train, test) in that order. Unpacking the
    # other way round would fit on the 20% part and score on the 80% part.
    # Seeding with the loop index makes the search reproducible.
    train, test = train_test_split(scrubbedData, test_size=0.2, random_state=i)
    train_X = train.drop('Survived', axis=1)
    train_y = train.Survived
    test_X = test.drop('Survived', axis=1)
    test_y = test.Survived
    splits.append([train_X, train_y, test_X, test_y])

# Start below any achievable accuracy so the first candidate is always
# recorded and bestC / bestGamma are guaranteed to be defined afterwards.
bestScore = float('-inf')
bestC = None
bestGamma = None
for gamma in ballparksGamma:
    for C in ballparksC:
        scoreTotal = 0
        for fit_X, fit_y, eval_X, eval_y in splits:
            classifier = SVC(C=C, gamma=gamma)
            classifier.fit(fit_X, fit_y)
            scoreTotal = scoreTotal + classifier.score(X=eval_X, y=eval_y)
        average = scoreTotal / len(splits)
        if average > bestScore:
            # Report each new best candidate as it is found.
            print(str(average) + ' C: ' + str(C) + ', gamma: ' + str(gamma))
            bestScore = average
            bestC = C
            bestGamma = gamma

# Final model: best hyperparameters, fitted on the complete training frame.
classifier = SVC(C=bestC, gamma=bestGamma)
classifier.fit(scrubbedData_X, scrubbedData_y)
In [ ]:
# Hold-out sanity check for the chosen hyperparameters.
# `classifier` was fitted on the whole of scrubbedData, so scoring it on any
# slice of that frame would measure accuracy on rows it has already seen.
# A separate model is therefore fitted on the train part only, and
# `classifier` itself is left untouched for the submission step.
# NOTE: bestC / bestGamma were themselves selected using all of scrubbedData,
# so the test score is still somewhat optimistic.
train, test = train_test_split(scrubbedData, test_size=0.2, random_state=1)

# Separate the label from the features in each part.
train_X = train.drop('Survived', axis=1)
train_y = train.Survived
test_X = test.drop('Survived', axis=1)
test_y = test.Survived

holdout_classifier = SVC(C=bestC, gamma=bestGamma)
holdout_classifier.fit(train_X, train_y)
svm_train_score = holdout_classifier.score(X=train_X, y=train_y)
svm_test_score = holdout_classifier.score(X=test_X, y=test_y)
print([svm_train_score, svm_test_score])
In [ ]:
# Predict on the competition test set and write the submission file.
# The CSV is scrubbed once; PassengerId is kept from the scrubbed frame because
# svm_scrub drops it from the feature frame.
test_scrubbed = scrub('test.csv')
test_for_submit = svm_scrub(test_scrubbed)

# Align the test features with the columns the classifier was fitted on:
# dummy columns that only occur in the training data (e.g. 'Cabin_T') are
# added as all-zero columns, columns unknown to the model are dropped, and the
# column order is made identical to the training frame. This replaces the
# fragile insert-at-a-fixed-position approach.
test_for_submit = test_for_submit.reindex(columns=scrubbedData_X.columns, fill_value=0)

submit_preds = classifier.predict(X=test_for_submit)
submission = pd.DataFrame({
    "PassengerId": test_scrubbed["PassengerId"],
    "Survived": submit_preds,
})
submission.to_csv("submission.csv", index=False)
In [ ]: