In [1]:
import pandas as pd
import numpy as np

NameToCategory

Here, I create a function that's used for creating a "Title" feature by pulling out Mr Miss, etc.

For groups that only a few ended up being bucketed into, I made a catch-all Other group.


In [2]:
# pull out name into something more useful. 
import re
mrPattern = re.compile('.*Mr\..*')
missPattern = re.compile('.*Miss\..*')
masterPattern = re.compile('.*Master\..*')
mrsPattern = re.compile('.*Mrs\..*')
donPattern = re.compile('.*Don\..*')
revPattern = re.compile('.*Rev\..*')
drPattern = re.compile('.*Dr\..*')
mmePattern = re.compile('.*Mme\..*')
msPattern = re.compile('.*Ms\..*')
majorPattern = re.compile('.*Major\..*')
ladyPattern = re.compile('.*Lady\..*')
sirPattern = re.compile('.*Sir\..*')
mllePattern = re.compile('.*Mlle\..*')
colPattern = re.compile('.*Col\..*')
captPattern = re.compile('.*Capt\..*')
countessPattern = re.compile('.*Countess\..*')
jonkheerPattern = re.compile('.*Jonkheer\..*')

def nameToCategory(name):
    if (mrPattern.match(name)):
        return 'Mr'
    elif (jonkheerPattern.match(name)):
        return 'Other'
    elif (countessPattern.match(name)):
        return 'Other'
    elif (captPattern.match(name)):
        return 'Other'
    elif (missPattern.match(name)):
        return 'Miss'
    elif (masterPattern.match(name)):
        return 'Master'
    elif (mrsPattern.match(name)):
        return 'Mrs'
    elif (donPattern.match(name)):
        return 'Other'
    elif (revPattern.match(name)):
        return 'Other'
    elif (drPattern.match(name)):
        return 'Other'
    elif (mmePattern.match(name)):
        return 'Mrs'
    elif (msPattern.match(name)):
        return 'Miss'
    elif (majorPattern.match(name)):
        return 'Other'
    elif (ladyPattern.match(name)):
        return 'Other'
    elif (sirPattern.match(name)):
        return 'Other'
    elif (mllePattern.match(name)):
        return 'Miss'
    elif (colPattern.match(name)):
        return 'Other'
    return 'Other'
#    raise Exception(name)

Scrub

This function reads in a csv, and cleans up the data to fill in missing data


In [3]:
import math

def scrub(filePath):
    data = pd.read_csv(filePath)
    char_cabin = data['Cabin'].astype(str)
    new_cabin = np.array([cabin[0] for cabin in char_cabin])
    data['Cabin'] = pd.Categorical(new_cabin)

    data['Fare'] = data['Fare'].fillna(data['Fare'].mean())
    
    c1Median = data.Age[data.Pclass == 1].median()
    c2Median = data.Age[data.Pclass == 2].median()
    c3Median = data.Age[data.Pclass == 3].median()

    def medianFor(row):
        if (row['Pclass'] == 1):
            return c1Median
        elif (row['Pclass'] == 2):
            return c2Median
        elif (row['Pclass'] == 3):
            return c3Median
        else:
            raise Exception('Goofed')
    
    def updateAge(row):
        if (math.isnan(row['Age'])):
            median = medianFor(row)
            row['Age'] = median
        return row
    
    # Update the missing ages with the median
    data = data.apply(updateAge, axis=1)
    
    new_embarked = np.where(data['Embarked'].isnull()
                           , 'S'
                           , data['Embarked'])
    
    data['Embarked'] = new_embarked
    
    data['Title'] = data['Name'].apply(nameToCategory)
    
    
    return data

In [4]:
def svm_scrub(df):
    fixed = df.drop(['Name', 'Ticket', 'Fare', 'PassengerId'], axis=1)
    fixedWithDummies = pd.get_dummies(fixed)
    return fixedWithDummies

In [5]:
scrubbedData = scrub('train.csv')
scrubbedData = svm_scrub(scrubbedData)
scrubbedData_X = scrubbedData.drop('Survived', axis=1)
scrubbedData_y = scrubbedData.Survived

from sklearn.model_selection import train_test_split

SVM


In [ ]:
from sklearn.svm import SVC

Cross Validation


In [ ]:
ballparksC = [0.001, 0.003, 0.01, 0.03, 0.1, 0.3, 1, 3, 10, 30, 100]
ballparksGamma = [1e-5, 3e-5, 1e-4, 3e-4, 1e-3]

splits = []
for i in range(100):
    test, train = train_test_split( scrubbedData
                                   , test_size=0.2)
    train_X = train.drop('Survived', axis=1)
    train_y = train.Survived
    test_X = test.drop('Survived', axis=1)
    test_y = test.Survived
    splits.append([train_X, train_y, test_X, test_y])
bestScore = 0
for gamma in ballparksGamma:
    for C in ballparksC:
        scoreTotal = 0
        for split in splits:
            classifier = SVC( C=C, gamma=gamma)
            classifier.fit(split[0], split[1])
            scoreTotal = scoreTotal + classifier.score(X=split[2], y=split[3])
        average = scoreTotal / len(splits)
        if (average > bestScore):
            print(str(average) + ' C: ' + str(C) + ', gamma: ' + str(gamma))
            bestScore=average
            bestC = C
            bestGamma = gamma

classifier = SVC(C=bestC, gamma=bestGamma)
classifier.fit(scrubbedData_X, scrubbedData_y)


0.617078651685 C: 0.001, gamma: 1e-05
0.61720505618 C: 30, gamma: 1e-05
0.617373595506 C: 100, gamma: 1e-05
0.617612359551 C: 30, gamma: 3e-05
0.676811797753 C: 100, gamma: 3e-05
0.799803370787 C: 100, gamma: 0.0001
0.804733146067 C: 100, gamma: 0.0003

In [ ]:
train, test = train_test_split( scrubbedData
                               , test_size=0.2
                               , random_state = 1)

## sans y
train_X = train.drop('Survived', axis=1)
train_y = train.Survived

test_X = test.drop('Survived', axis=1)
test_y = test.Survived

svm_train_score = classifier.score(X=train_X, y=train_y)
svm_test_score = classifier.score(X=test_X, y=test_y)

print([svm_train_score, svm_test_score])

Submit


In [ ]:
test_for_submit = scrub('test.csv')
test_for_submit = svm_scrub(test_for_submit)

## Adding a label that doesn't exist in any of the test data. TODO: ask Jeremy if there is a way to do this better?
test_for_submit.insert(14, 'Cabin_T', 0)

submit_preds = classifier.predict(X=test_for_submit)

submission = pd.DataFrame({ "PassengerId": scrub('test.csv')["PassengerId"]
                          , "Survived":submit_preds})

submission.to_csv( "submission.csv"
                 , index=False)

In [ ]: