In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

Classify new features:

  • child : age < 16
  • woman_or_child

In [2]:
df = pd.read_csv('train.csv', index_col=0)


def preprocess(df):
    df.columns = df.columns.map(lambda x: x.lower())

    def woman_child_or_man(passenger):
        age, sex = passenger
        if age < 16:
            return "child"
        else:
            return dict(male="man", female="woman")[sex]

    df['who'] = df[['age', 'sex']].apply(woman_child_or_man, axis=1)
    df["alone"] = ~(df.parch + df.sibsp).astype(bool)
    df["class"] = df.pclass.map({1: "First", 2: "Second", 3: "Third"})
    df["deck"] = df.cabin.str[0].map(lambda s: np.nan if s == "T" else s)

    df['sex_f'] = df.sex.map({'male': 0, 'female': 1})
    df = df.rename(columns={'pclass': 'class_f'})
    embarked_values, embarked_labels = pd.factorize(df.embarked)
    df['embarked_f'] = embarked_values
    df['embarked_f'] = df.embarked_f.replace({-1: np.nan})
    
    return df

df = preprocess(df)

Bit of visualization before diving in (for a better version see http://nbviewer.ipython.org/gist/mwaskom/8224591)


In [3]:
from missmap import missmap
missmap(df)


Out[3]:
<matplotlib.axes._subplots.AxesSubplot at 0x10e2fc7b8>

Demographics on Passengers


In [4]:
pal = dict(man="#4682B4", woman="#CD5C5C", child="#2E8B57", male="#6495ED", female="#F08080",
           First="#33CC99", Second="#006699", Third="#E3CEF6")

sns.factorplot("who", data=df, palette=pal)


Out[4]:
<seaborn.axisgrid.FacetGrid at 0x118404080>

In [5]:
sns.factorplot("class", data=df, hue="who", palette=pal)


Out[5]:
<seaborn.axisgrid.FacetGrid at 0x1181f8e10>

In [6]:
ax = sns.kdeplot(df.age, shade=True)
# ax.set_xlim(0, 80)  # matplotlib doesn't redraw



In [7]:
g = sns.FacetGrid(df, hue="who", aspect=3, palette=pal)
g.map(sns.kdeplot, "age", shade=True)


Out[7]:
<seaborn.axisgrid.FacetGrid at 0x11895eda0>

In [8]:
# who's age is missing
df['bad_age'] = pd.isnull(df.age)
df.groupby(['sex', 'class'])['bad_age'].agg('sum')


Out[8]:
sex     class 
female  First      9
        Second     2
        Third     42
male    First     21
        Second     9
        Third     94
Name: bad_age, dtype: float64

In [9]:
# relative
df['bad_age'] = pd.isnull(df.age)
df.groupby(['sex', 'class'])['bad_age'].agg('mean')


Out[9]:
sex     class 
female  First     0.095745
        Second    0.026316
        Third     0.291667
male    First     0.172131
        Second    0.083333
        Third     0.270893
Name: bad_age, dtype: float64

In [10]:
sns.factorplot("class", data=df, palette="PuBu_d")


Out[10]:
<seaborn.axisgrid.FacetGrid at 0x118a2e3c8>

In [11]:
g = sns.FacetGrid(df, hue="class", aspect=3, palette=pal)
g.map(sns.kdeplot, "age", shade=True)
#g.set(xlim=(0, 80))


Out[11]:
<seaborn.axisgrid.FacetGrid at 0x118b45198>

In [12]:
fg = sns.FacetGrid(df, col="sex", row="class", hue="sex", size=2.5, aspect=2.5, palette=pal)
fg.map(sns.kdeplot, "age", shade=True)
fg.map(sns.rugplot, "age")
sns.despine(left=True)
fg.set(xlim=(0, 80));



In [13]:
sns.factorplot("deck", hue="class", data=df, palette="BuPu_d");


Who Survived?


In [14]:
sns.factorplot("survived", data=df, palette="BuPu_r")


Out[14]:
<seaborn.axisgrid.FacetGrid at 0x118f8b9b0>

In [15]:
sns.factorplot("survived", data=df, hue="class", palette=pal)


Out[15]:
<seaborn.axisgrid.FacetGrid at 0x11991ea20>

In [16]:
sns.lmplot("age", "survived", df, hue="class", logistic=True)


Out[16]:
<seaborn.axisgrid.FacetGrid at 0x111101630>

In [17]:
sns.lmplot("age", "survived", df, hue="class", palette=pal, logistic=True)


Out[17]:
<seaborn.axisgrid.FacetGrid at 0x11d7cc278>

Modelling


In [18]:
feature_labels = ['class_f', 'sex_f', 'age', 'sibsp', 'parch', 'fare', 'alone',
                  'sex_f', 'embarked_f']
idx = df[feature_labels].dropna().index
X = df.loc[idx, feature_labels].values
y = df.survived.loc[idx]

In [19]:
test = pd.read_csv('test.csv')
test = preprocess(test)

yhats = {}  # for later

In [20]:
pd.isnull(test).any()


Out[20]:
passengerid    False
class_f        False
name           False
sex            False
age             True
sibsp          False
parch          False
ticket         False
fare            True
cabin           True
embarked       False
who            False
alone          False
class          False
deck            True
sex_f          False
embarked_f     False
dtype: bool

Logistic Regression


In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn import cross_validation as cv
from sklearn.grid_search import GridSearchCV

Naive


In [22]:
clf = LogisticRegression()
clf.fit(X, y)
yhats['yhat_logistic_0'] = clf.predict(test[feature_labels].values)
print("Training Score: ", clf.score(X, y))


Training Score:  0.800561797753

$\ell_1$ Penalty


In [23]:
clf = LogisticRegression(penalty="l1")
clf.fit(X, y)
yhats['yhat_logistic_l1'] = clf.predict(test[feature_labels].values)
print("Training Score: ", clf.score(X, y))


Training Score:  0.799157303371

Grid Search of $C$ the regularization strength.


In [24]:
lm = LogisticRegression()
clf = GridSearchCV(lm, {'C': np.linspace(.01, 1, 100)})
clf.fit(X, y)
print("Training Score: ", clf.best_score_)
yhats['yhat_logistic_gs_C'] = clf.best_estimator_.predict(test[feature_labels].values)


Training Score:  0.789325842697

In [25]:
def plot_score(clf, x_param):
    xs = []
    ys = []
    for g in clf.grid_scores_:
        xs.append(g[0][x_param])
        ys.append(g[1])
    
    fig, ax = plt.subplots()
    ax.plot(xs, ys)
    return ax

In [26]:
plot_score(clf, x_param='C')


Out[26]:
<matplotlib.axes._subplots.AxesSubplot at 0x11dfb3e10>

Ridge Regression


In [27]:
from sklearn import linear_model as lm

In [28]:
# Naive

clf = lm.RidgeClassifier()
clf.fit(X, y.values)
print("Training Score: ", clf.score(X, y))
yhats['ridge_0'] = clf.predict(test[feature_labels].values)


Training Score:  0.796348314607

In [29]:
# normalize
clf = lm.RidgeClassifier(normalize=True)
clf.fit(X, y.values)
print("Training Score: ", clf.score(X, y))
yhats['ridge_normalized'] = clf.predict(test[feature_labels].values)


Training Score:  0.780898876404

In [30]:
# gs over alpha

clf = lm.RidgeClassifierCV(alphas=np.linspace(0, 1, 100))
clf.fit(X, y.values)
print("Best alpha: ", clf.alpha_)
print("Training Score: ", clf.score(X, y))
yhats['ridge_cv_alpha'] = clf.predict(test[feature_labels])


Best alpha:  1.0
Training Score:  0.796348314607

SVC


In [31]:
from sklearn import svm

In [32]:
svc = svm.SVC()
clf = GridSearchCV(svc, param_grid={'C': np.linspace(.01, 1, 100)})
clf.fit(X, y)
print("Training score: ", clf.best_score_)
yhats['svc_cv'] = clf.predict(test[feature_labels].dropna().dropna().values)


Training score:  0.691011235955

NuSVC

Nearest Neighbors


In [33]:
from sklearn import neighbors

In [34]:
clf = neighbors.KNeighborsClassifier()
clf.fit(X, y)
print("Training Score: ", clf.score(X, y))
yhats['knn_0'] = clf.predict(test[feature_labels].dropna().values)


Training Score:  0.790730337079

In [35]:
knn = neighbors.KNeighborsClassifier()
clf = GridSearchCV(knn, param_grid={'n_neighbors': [1, 3, 5, 10, 20], 'p': [1, 2]})
clf.fit(X, y)
print("Training Score: ", clf.best_score_)
yhats['knn_cv'] = clf.predict(test[feature_labels].dropna().values)


Training Score:  0.695224719101

Decision Tree


In [36]:
from sklearn import tree

In [37]:
clf = tree.DecisionTreeClassifier()
clf.fit(X, y)
print("Training Score: ", clf.score(X, y))
yhats['tree_0'] = clf.predict(test[feature_labels].dropna().values)


Training Score:  0.98595505618

In [38]:
params = dict(criterion=['gini', 'entropy'],
              splitter=['best', 'random'],
              max_features=[None, 'auto', 'sqrt', 'log2'],
              max_depth=[None, 4, 8, 12])

dt = tree.DecisionTreeClassifier()
clf = GridSearchCV(dt, params)
clf.fit(X, y)
print("Training Score", clf.best_score_)
yhats['tree_cv'] = clf.best_estimator_.predict(test[feature_labels].dropna().values)


Training Score 0.794943820225

Ensembe Methods


In [39]:
from sklearn import ensemble

In [40]:
clf = ensemble.RandomForestClassifier()
clf.fit(X, y)
print("Training Score: ", clf.score(X, y))
yhats['random_forrest_0'] = clf.predict(test[feature_labels].dropna().values)


Training Score:  0.960674157303

In [41]:
params['n_estimators'] = [5, 10, 15, 25, 35]
params.pop('splitter')
rfc = ensemble.RandomForestClassifier()
clf = GridSearchCV(rfc, params)
clf.fit(X, y)
print("Training Score", clf.best_score_)
yhats['rfc_cv'] = clf.best_estimator_.predict(test[feature_labels].dropna().values)


Training Score 0.821629213483

In [42]:
clf = ensemble.ExtraTreesClassifier()
clf.fit(X, y)
print("Training Score: ", clf.score(X, y))
yhats['extra_tree_0'] = clf.predict(test[feature_labels].dropna().values)


Training Score:  0.98595505618

In [43]:
erfc = ensemble.ExtraTreesClassifier()
clf = GridSearchCV(erfc, params)
clf.fit(X, y)
print("Training Score", clf.best_score_)
yhats['erfc_cv'] = clf.best_estimator_.predict(test[feature_labels].dropna().values)


Training Score 0.816011235955

In [44]:
clf = ensemble.AdaBoostClassifier()
scores = cv.cross_val_score(clf, X, y)
print("Training Score: ", scores.mean())


Training Score:  0.797811107565

In [45]:
abc = ensemble.AdaBoostClassifier()

params = dict(n_estimators=[20, 30, 40, 50, 75, 100],
              learning_rate=[.1, .25, .5, .75, .9, 1])

clf = GridSearchCV(abc, params)
clf.fit(X, y)
print("Training Score: ", clf.best_score_)
yhats['ada_boost'] = clf.predict(test[feature_labels].dropna().values)


Training Score:  0.800561797753

Gradiant Boosting


In [46]:
clf = ensemble.GradientBoostingClassifier()
cv.cross_val_score(clf, X, y).mean()


Out[46]:
0.80341925799855807

In [47]:
gbc = ensemble.GradientBoostingClassifier()

clf = GridSearchCV(gbc, params)
clf.fit(X, y)
print("Training Score: ", clf.best_score_)
yhats['gbc'] = clf.predict(test[feature_labels].dropna().values)


Training Score:  0.823033707865

In [ ]: