In [74]:
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeCV, LogisticRegressionCV
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score
from sklearn.model_selection import train_test_split

%matplotlib inline

In [ ]:
def prep_features(csv_file="data/train.csv"):
    df = pd.read_csv(csv_file, index_col=0)
    df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
    df['FamSize'] = df['Parch'] + df['SibSp']

    decks_df = df['Cabin'].str.extract(r'(?P<Deck>\w)(?P<CabinNumber>\d+)', expand=True)
    deck_dummies = pd.get_dummies(decks_df['Deck'], prefix='Deck')
    title_dummies = pd.get_dummies(df['Title'])
    gender_dummies = pd.get_dummies(df['Sex'])
    class_dummies = pd.get_dummies(df['Pclass'], prefix='Class')
    embarked_dummies = pd.get_dummies(df['Embarked'])
    has_age = df['Age'].notnull().astype(int)
    has_age.name = 'has_age'

    # building the pieces first and calling .concat() once is much faster
    # than concatenating repeatedly
    df = pd.concat([df, decks_df, deck_dummies, title_dummies, gender_dummies,
                    class_dummies, embarked_dummies, has_age], axis=1)
    return df
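
With the `return` added, the inline block below (repeated again for the test set near the end) collapses to a single call; a usage sketch:

In [ ]:
df = prep_features("data/train.csv")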

In [195]:
df = pd.read_csv("data/train.csv", index_col=0)

df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
df['FamSize'] = df['Parch'] + df['SibSp']

decks_df = df['Cabin'].str.extract(r'(?P<Deck>\w)(?P<CabinNumber>\d+)', expand=True)
deck_dummies = pd.get_dummies(decks_df['Deck'], prefix='Deck')
title_dummies = pd.get_dummies(df['Title'])
gender_dummies = pd.get_dummies(df['Sex'])
class_dummies = pd.get_dummies(df['Pclass'], prefix='Class')
embarked_dummies = pd.get_dummies(df['Embarked'])
has_age = df['Age'].notnull().astype(int)
has_age.name = 'has_age'

# building the pieces first and calling .concat() once is much faster
# than concatenating repeatedly
df = pd.concat([df, decks_df, deck_dummies, title_dummies, gender_dummies,
                class_dummies, embarked_dummies, has_age], axis=1)

Use Ridge regression to impute ages


In [196]:
# train on rows with a known age, predict ages for the rest
X_train_age = df[df['Age'].notnull()][df.corr().columns]
X_needs_age = df[df['Age'].isnull()][df.corr().columns]
y_train_age = X_train_age.pop('Age')
X_needs_age = X_needs_age.drop('Age', axis=1)
ridge_age = RidgeCV()

X_train_age['Fare'] = X_train_age['Fare'].fillna(X_train_age['Fare'].mean())

ridge_age.fit(X_train_age, y_train_age)
age_pred = np.array(ridge_age.predict(X_needs_age))

# clamp negative predicted ages to zero
age_pred[age_pred < 0] = 0

df.loc[df['Age'].isnull(), 'Age'] = age_pred
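
A quick sanity check on the imputer, not part of the original run: hold out some of the known ages and measure the error. The split and metric here are my choices.

In [ ]:
from sklearn.metrics import mean_absolute_error

Xa_tr, Xa_te, ya_tr, ya_te = train_test_split(X_train_age, y_train_age, random_state=0)
print(mean_absolute_error(ya_te, RidgeCV().fit(Xa_tr, ya_tr).predict(Xa_te)))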

In [78]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.
    """
    # normalize before drawing so the colors match the printed values
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')

    print(cm)

    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)

    # annotate each cell, white text on dark cells
    thresh = cm.max() / 2.
    fmt = '.2f' if normalize else 'd'
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')

In [79]:
plt.hist(df['Age'])


Out[79]:
(array([  57.,   40.,  189.,  258.,  160.,   95.,   53.,   26.,   11.,    2.]),
 array([  0.,   8.,  16.,  24.,  32.,  40.,  48.,  56.,  64.,  72.,  80.]),
 <a list of 10 Patch objects>)

In [80]:
has_age.sum()


Out[80]:
714

In [81]:
plt.hist(df[(df['Survived'] == 0) & (df['has_age'] == 1)]['Age'], color='b', label='Died', bins=20)
plt.hist(df[(df['Survived'] == 1) & (df['has_age'] == 1)]['Age'], color='g', alpha=0.8, label='Survived', bins=20)
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Number of people by age group')
plt.legend()
plt.show()



In [82]:
male_survivor_ages = df[(df['Survived'] == 1) & (df['has_age'] == 1) & (df['male'] == 1)]['Age']
female_survivor_ages = df[(df['Survived'] == 1) & (df['has_age'] == 1) & (df['female'] == 1)]['Age']

plt.hist(female_survivor_ages, color='b', label='Female', bins=20)
plt.hist(male_survivor_ages, color='g', label='Male', bins=20, alpha=.8)
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Survivors by age and gender')
plt.legend()
plt.show()


Logistic Regression (no feature engineering)


In [83]:
X_min = df[['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']].copy()
y_min = X_min.pop('Survived')

In [84]:
df[['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']].describe()


Out[84]:
         Survived      Pclass         Age       SibSp       Parch        Fare
count  891.000000  891.000000  891.000000  891.000000  891.000000  891.000000
mean     0.383838    2.308642   29.670018    0.523008    0.381594   32.204208
std      0.486592    0.836071   13.678426    1.102743    0.806057   49.693429
min      0.000000    1.000000    0.000000    0.000000    0.000000    0.000000
25%      0.000000    2.000000   21.000000    0.000000    0.000000    7.910400
50%      0.000000    3.000000   29.000000    0.000000    0.000000   14.454200
75%      1.000000    3.000000   36.359090    1.000000    0.000000   31.000000
max      1.000000    3.000000   80.000000    8.000000    6.000000  512.329200

In [85]:
X_train_minimal, X_test_minimal, y_train_minimal, y_test_minimal = train_test_split(X_min, y_min)

logreg_min = LogisticRegressionCV(n_jobs=-1)

logreg_min.fit(X_train_minimal, y_train_minimal)

y_pred = logreg_min.predict(X_test_minimal)

cfn_matrix_minimal = confusion_matrix(y_test_minimal, y_pred)
np.set_printoptions(precision=2)

In [86]:
plt.figure()
plot_confusion_matrix(cfn_matrix_minimal, ['Died', 'Surv'], title="Cfn Matrix, no normalization")


Confusion matrix, without normalization
[[129  10]
 [ 51  33]]

In [87]:
plt.figure()
plot_confusion_matrix(cfn_matrix_minimal, ['Died', 'Surv'], normalize=True, title="Cfn Matrix, normalized")


Normalized confusion matrix
[[ 0.93  0.07]
 [ 0.61  0.39]]

In [88]:
print(classification_report(y_test_minimal, y_pred, target_names=['Died', 'Surv']))


             precision    recall  f1-score   support

       Died       0.72      0.93      0.81       139
       Surv       0.77      0.39      0.52        84

avg / total       0.74      0.73      0.70       223
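
Recall for survivors is only 0.39: with just the raw numeric columns, the model leans heavily toward predicting the majority class (died).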


In [89]:
logreg_min.score(X_test_minimal, y_test_minimal)


Out[89]:
0.726457399103139

In [90]:
logreg_min.score(X_min, y_min)


Out[90]:
0.72390572390572394

In [91]:
precision_score(y_test_minimal, y_pred)


Out[91]:
0.76744186046511631

In [92]:
recall_score(y_test_minimal, y_pred)


Out[92]:
0.39285714285714285

Logistic Regression (with added features)


In [107]:
X = df[df.corr().columns].copy()
y = X.pop('Survived')

In [108]:
X.describe()


Out[108]:
Pclass Age SibSp Parch Fare FamSize Deck_A Deck_B Deck_C Deck_D ... the Countess female male Class_1 Class_2 Class_3 C Q S has_age
count 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.00000 ... 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000 891.000000
mean 2.308642 29.670018 0.523008 0.381594 32.204208 0.904602 0.016835 0.052750 0.066218 0.03367 ... 0.001122 0.352413 0.647587 0.242424 0.206510 0.551066 0.188552 0.086420 0.722783 0.801347
std 0.836071 13.678426 1.102743 0.806057 49.693429 1.613459 0.128725 0.223659 0.248802 0.18048 ... 0.033501 0.477990 0.477990 0.428790 0.405028 0.497665 0.391372 0.281141 0.447876 0.399210
min 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.00000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 2.000000 21.000000 0.000000 0.000000 7.910400 0.000000 0.000000 0.000000 0.000000 0.00000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
50% 3.000000 29.000000 0.000000 0.000000 14.454200 0.000000 0.000000 0.000000 0.000000 0.00000 ... 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 1.000000
75% 3.000000 36.359090 1.000000 0.000000 31.000000 1.000000 0.000000 0.000000 0.000000 0.00000 ... 0.000000 1.000000 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 1.000000
max 3.000000 80.000000 8.000000 6.000000 512.329200 10.000000 1.000000 1.000000 1.000000 1.00000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 39 columns


In [111]:
X_train_features, X_test_features, y_train_features, y_test_features = train_test_split(X, y)

logreg = LogisticRegressionCV(n_jobs=-1)

logreg.fit(X_train_features, y_train_features)

y_pred = logreg.predict(X_test_features)

cfn_matrix_features = confusion_matrix(y_test_features, y_pred)
np.set_printoptions(precision=2)

In [112]:
plt.figure()
plot_confusion_matrix(cfn_matrix_features, ['Died', 'Surv'], title="Cfn Matrix, no normalization")


Confusion matrix, without normalization
[[109  23]
 [ 17  74]]

In [99]:
plt.figure()
plot_confusion_matrix(cfn_matrix_features, ['Died', 'Surv'], normalize=True, title="Cfn Matrix, normalized")


Normalized confusion matrix
[[ 0.85  0.15]
 [ 0.24  0.76]]

In [100]:
print(classification_report(y_test_features, y_pred, target_names=['Died', 'Surv']))


             precision    recall  f1-score   support

       Died       0.87      0.85      0.86       144
       Surv       0.74      0.76      0.75        79

avg / total       0.82      0.82      0.82       223


In [101]:
logreg.score(X, y)


Out[101]:
0.84287317620650959

In [102]:
precision_score(y_test_features, y_pred)


Out[102]:
0.7407407407407407

In [103]:
recall_score(y_test_features, y_pred)


Out[103]:
0.759493670886076

In [104]:
logreg.C_


Out[104]:
array([ 166.81])

In [105]:
logreg.Cs_


Out[105]:
array([  1.00e-04,   7.74e-04,   5.99e-03,   4.64e-02,   3.59e-01,
         2.78e+00,   2.15e+01,   1.67e+02,   1.29e+03,   1.00e+04])
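
The selected C_ of ~167 is the eighth point of this grid, which is LogisticRegressionCV's default of ten values log-spaced between 1e-4 and 1e4 (np.logspace(-4, 4, 10)); the CV favored fairly light regularization.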

In [106]:
sum(y_test_features) / len(y_test_features)


Out[106]:
0.35426008968609868
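
So about 35% of the held-out passengers survived, meaning a classifier that always predicts "died" already scores ~0.65. A baseline sketch for comparison (my addition, not in the original run):

In [ ]:
from sklearn.dummy import DummyClassifier

dummy = DummyClassifier(strategy='most_frequent')
dummy.fit(X_train_features, y_train_features)
print(dummy.score(X_test_features, y_test_features))  # ~0.65 on this split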

Decision Tree


In [114]:
# reuse the train/test split from the logistic regression section:
# X_train_features, X_test_features, y_train_features, y_test_features
In [115]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier

In [116]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train_features, y_train_features)


Out[116]:
DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [120]:
dtc_pred = dtc.predict(X_test_features)

In [121]:
cfn_matrix_features = confusion_matrix(y_test_features, dtc_pred)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cfn_matrix_features, ['Died', 'Surv'], title="Cfn Matrix, no normalization")


Confusion matrix, without normalization
[[109  23]
 [ 26  65]]
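
A quick look at what the tree keys on; feature_importances_ is available on any fitted sklearn tree (this cell is my addition):

In [ ]:
pd.Series(dtc.feature_importances_, index=X_train_features.columns) \
    .sort_values(ascending=False).head(10)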

Random Forest


In [145]:
from math import sqrt  # unused below; presumably for trying max_features=sqrt(n_features)
In [155]:
rfc = RandomForestClassifier(n_estimators=1000, max_features=14, max_depth=5, n_jobs=-1)
rfc.fit(X_train_features, y_train_features)


Out[155]:
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=5, max_features=14, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            n_estimators=1000, n_jobs=-1, oob_score=False,
            random_state=None, verbose=0, warm_start=False)

In [156]:
rfc_pred = rfc.predict(X_test_features)

In [157]:
cfn_matrix_features = confusion_matrix(y_test_features, rfc_pred)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cfn_matrix_features, ['Died', 'Surv'], title="Cfn Matrix, no normalization")


Confusion matrix, without normalization
[[119  13]
 [ 23  68]]
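
A single split can be noisy; a cross-validated estimate gives a steadier number. A sketch (my addition), reusing the full feature matrix from the logistic regression section:

In [ ]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(rfc, X, y, cv=5, n_jobs=-1)
print(scores.mean(), scores.std())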

Run model on test data


In [197]:
df = pd.read_csv("data/test.csv", index_col=0)
df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
df['FamSize'] = df['Parch'] + df['SibSp']

decks_df = df['Cabin'].str.extract(r'(?P<Deck>\w)(?P<CabinNumber>\d+)', expand=True)
deck_dummies = pd.get_dummies(decks_df['Deck'], prefix='Deck')
title_dummies = pd.get_dummies(df['Title'])
gender_dummies = pd.get_dummies(df['Sex'])
class_dummies = pd.get_dummies(df['Pclass'], prefix='Class')
embarked_dummies = pd.get_dummies(df['Embarked'])
has_age = df['Age'].notnull().astype(int)
has_age.name = 'has_age'

# building the pieces first and calling .concat() once is much faster
# than concatenating repeatedly
df = pd.concat([df, decks_df, deck_dummies, title_dummies, gender_dummies,
                class_dummies, embarked_dummies, has_age], axis=1)

In [199]:
# same ridge imputation as for the training set; here the Fare fill matters,
# since the test set has one missing fare (in a row whose age is known)
X_train_age = df[df['Age'].notnull()][df.corr().columns]
X_needs_age = df[df['Age'].isnull()][df.corr().columns]
y_train_age = X_train_age.pop('Age')
X_needs_age = X_needs_age.drop('Age', axis=1)
ridge_age = RidgeCV()

X_train_age['Fare'] = X_train_age['Fare'].fillna(X_train_age['Fare'].mean())

ridge_age.fit(X_train_age, y_train_age)
age_pred = np.array(ridge_age.predict(X_needs_age))

# clamp negative predicted ages to zero
age_pred[age_pred < 0] = 0

df.loc[df['Age'].isnull(), 'Age'] = age_pred

In [204]:
X_train_age.describe()


Out[204]:
Pclass SibSp Parch Fare FamSize Deck_A Deck_B Deck_C Deck_D Deck_E ... Rev female male Class_1 Class_2 Class_3 C Q S has_age
count 332.000000 332.000000 332.000000 332.000000 332.000000 332.000000 332.000000 332.000000 332.000000 332.000000 ... 332.000000 332.000000 332.000000 332.000000 332.000000 332.000000 332.000000 332.000000 332.000000 332.0
mean 2.144578 0.481928 0.397590 40.982087 0.879518 0.021084 0.054217 0.105422 0.033133 0.027108 ... 0.006024 0.382530 0.617470 0.295181 0.265060 0.439759 0.246988 0.066265 0.686747 1.0
std 0.846283 0.874084 0.810651 61.135998 1.385271 0.143882 0.226787 0.307560 0.179253 0.162645 ... 0.077498 0.486739 0.486739 0.456812 0.442031 0.497107 0.431911 0.249120 0.464516 0.0
min 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.0
25% 1.000000 0.000000 0.000000 8.050000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.0
50% 2.000000 0.000000 0.000000 16.050000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 1.0
75% 3.000000 1.000000 1.000000 41.131365 1.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 1.000000 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 1.000000 1.0
max 3.000000 8.000000 6.000000 512.329200 10.000000 1.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.0

8 rows × 30 columns


In [205]:
X_train_age['Fare'].mean()


Out[205]:
40.98208731117823

In [206]:
df.describe()


Out[206]:
Pclass Age SibSp Parch Fare FamSize Deck_A Deck_B Deck_C Deck_D ... Rev female male Class_1 Class_2 Class_3 C Q S has_age
count 418.000000 418.000000 418.000000 418.000000 417.000000 418.000000 418.000000 418.000000 418.000000 418.000000 ... 418.000000 418.000000 418.000000 418.000000 418.000000 418.000000 418.000000 418.000000 418.000000 418.000000
mean 2.265550 29.870138 0.447368 0.392344 35.627188 0.839713 0.016746 0.043062 0.083732 0.028708 ... 0.004785 0.363636 0.636364 0.255981 0.222488 0.521531 0.244019 0.110048 0.645933 0.794258
std 0.841838 13.139162 0.896760 0.981429 55.907576 1.519072 0.128474 0.203240 0.277317 0.167185 ... 0.069088 0.481622 0.481622 0.436934 0.416416 0.500135 0.430019 0.313324 0.478803 0.404727
min 1.000000 0.170000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 1.000000 22.000000 0.000000 0.000000 7.895800 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000
50% 3.000000 27.461811 0.000000 0.000000 14.454200 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 0.000000 0.000000 1.000000 1.000000
75% 3.000000 36.965880 1.000000 0.000000 31.500000 1.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 1.000000 1.000000 1.000000 0.000000 1.000000 0.000000 0.000000 1.000000 1.000000
max 3.000000 76.000000 8.000000 9.000000 512.329200 10.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000

8 rows × 31 columns


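The notebook stops before producing predictions. A sketch of the remaining step (my addition): the test frame's dummy columns must be aligned to the training columns, since some titles and decks appear in only one of the two files. Names like X_kaggle and submission.csv are mine.

In [ ]:
# select the training feature columns from the test frame; columns the test
# set never produced are created and filled with 0
X_kaggle = df.reindex(columns=X.columns, fill_value=0)
X_kaggle['Fare'] = X_kaggle['Fare'].fillna(X_kaggle['Fare'].mean())  # the one missing fare

submission = pd.DataFrame({'Survived': rfc.predict(X_kaggle)}, index=df.index)
submission.to_csv('submission.csv')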