In [74]:
    
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeCV, LogisticRegressionCV
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score
from sklearn.model_selection import train_test_split
%matplotlib inline
    
In [ ]:
    
def prep_features(csv_file="data/train.csv"):
    """Load a Titanic CSV and engineer model-ready features.

    Adds Title (parsed from Name), FamSize (Parch + SibSp), Deck /
    CabinNumber extracted from Cabin, one-hot dummies for Deck, Title,
    Sex, Pclass and Embarked, and a has_age indicator column.

    Parameters
    ----------
    csv_file : str
        Path to a CSV with the Kaggle Titanic schema; the first column
        (PassengerId) is used as the index.

    Returns
    -------
    pandas.DataFrame
        The original columns plus all engineered ones.
    """
    df = pd.read_csv(csv_file, index_col=0)
    # Title is the token between the comma and the period in Name,
    # e.g. "Braund, Mr. Owen Harris" -> "Mr".
    df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
    df['FamSize'] = df['Parch'] + df['SibSp']
    # Raw string so \w and \d are regex character classes, not invalid
    # string escapes (a SyntaxWarning on modern Python).
    decks_df = df['Cabin'].str.extract(r'(?P<Deck>\w)(?P<CabinNumber>\d+)', expand=True)
    deck_dummies = pd.get_dummies(decks_df['Deck'], prefix='Deck')
    title_dummies = pd.get_dummies(df['Title'])
    gender_dummies = pd.get_dummies(df['Sex'])
    class_dummies = pd.get_dummies(df['Pclass'], prefix='Class')
    embarked_dummies = pd.get_dummies(df['Embarked'])
    has_age = df['Age'].notnull().astype('int')
    has_age.name = 'has_age'
    # saving to variables and using .concat() once seems to be much faster
    df = pd.concat([df, decks_df, deck_dummies, title_dummies, gender_dummies,
                    class_dummies, embarked_dummies, has_age], axis=1)
    # Bug fix: the original built `df` but never returned it, so callers
    # always received None.
    return df
    
In [195]:
    
# Feature engineering on the training set (same steps as prep_features).
df = pd.read_csv("data/train.csv", index_col=0)
# Title is the token between the comma and the period in Name,
# e.g. "Braund, Mr. Owen Harris" -> "Mr".
df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
df['FamSize'] = df['Parch'] + df['SibSp']
# Raw string so \w and \d are regex character classes, not invalid
# string escapes (a SyntaxWarning on modern Python).
decks_df = df['Cabin'].str.extract(r'(?P<Deck>\w)(?P<CabinNumber>\d+)', expand=True)
deck_dummies = pd.get_dummies(decks_df['Deck'], prefix='Deck')
title_dummies = pd.get_dummies(df['Title'])
gender_dummies = pd.get_dummies(df['Sex'])
class_dummies = pd.get_dummies(df['Pclass'], prefix='Class')
embarked_dummies = pd.get_dummies(df['Embarked'])
has_age = df['Age'].notnull().astype('int')
has_age.name = 'has_age'
# saving to variables and using .concat() once seems to be much faster
df = pd.concat([df, decks_df, deck_dummies, title_dummies, gender_dummies, class_dummies,
                embarked_dummies, has_age], axis=1)
    
In [196]:
    
# Impute missing ages: fit a ridge regression on the numeric columns of
# rows that have an Age, then predict Age for the rows that don't.
# (df.corr().columns is used as a trick to select the numeric columns.)
X_train_age = df[df['Age'].notnull()][df.corr().columns]
X_needs_age = df[df['Age'].isnull()][df.corr().columns]
y_train_age = X_train_age.pop('Age')
X_needs_age = X_needs_age.drop('Age', axis=1)
ridge_age = RidgeCV()
# Bug fix: assign the filled column back instead of calling
# fillna(inplace=True) on a chained column slice, which may operate on a
# temporary copy and is a no-op under pandas copy-on-write.  Also fill
# Fare in X_needs_age (with the training mean) so predict() cannot fail
# on a NaN fare.
fare_mean = X_train_age['Fare'].mean()
X_train_age['Fare'] = X_train_age['Fare'].fillna(fare_mean)
X_needs_age['Fare'] = X_needs_age['Fare'].fillna(fare_mean)
ridge_age.fit(X_train_age, y_train_age)
age_pred = np.array(ridge_age.predict(X_needs_age))
# Ridge can extrapolate below zero; clip predicted ages to a valid range.
age_pred[age_pred < 0] = 0
df.loc[df['Age'].isnull(), 'Age'] = age_pred
    
In [78]:
    
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    Parameters
    ----------
    cm : array-like of shape (n_classes, n_classes)
        Confusion matrix (integer counts, as from sklearn's confusion_matrix).
    classes : sequence of str
        Tick labels for both axes, in the same order as cm's rows.
    normalize : bool
        If True, each row is divided by its row sum before plotting.
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap for the image.
    """
    # Bug fix: normalize BEFORE imshow so the cell colours, the printed
    # matrix, and the text annotations all show the same numbers.  The
    # original called imshow first, so the image showed raw counts while
    # the annotations showed normalized rates.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # Annotate every cell; white text on dark cells for contrast.
    # Normalized values are shown with two decimals instead of the raw
    # float repr.
    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    
In [79]:
    
# Distribution of Age after the ridge-based imputation above.
plt.hist(df['Age'])
    
    Out[79]:
    
In [80]:
    
# Number of passengers whose Age was present in the raw data.
has_age.sum()
    
    Out[80]:
In [81]:
    
# Age distributions for non-survivors vs. survivors, restricted to
# passengers whose age was actually recorded (has_age == 1).
known_age = df['has_age'] == 1
died_ages = df[(df['Survived'] == 0) & known_age]['Age']
survived_ages = df[(df['Survived'] == 1) & known_age]['Age']
plt.hist(died_ages, color='b', label='Died', bins=20)
plt.hist(survived_ages, color='g', alpha=0.8, label='Survived', bins=20)
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Number of people by age group')
plt.legend()
plt.show()
    
    
In [82]:
    
# Survivor age distributions split by gender (recorded ages only).
survived_with_age = (df['Survived'] == 1) & (df['has_age'] == 1)
male_survivor_ages = df[survived_with_age & (df['male'] == 1)]['Age']
female_survivor_ages = df[survived_with_age & (df['female'] == 1)]['Age']
plt.hist(female_survivor_ages, color='b', label='Female', bins=20)
plt.hist(male_survivor_ages, color='g', label='Male', bins=20, alpha=.8)
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Survivors by age and gender')
plt.legend()
plt.show()
    
    
In [83]:
    
# Minimal numeric feature set; pop removes Survived from X_min and
# returns it as the target series.
X_min = df[['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']]
y_min = X_min.pop('Survived')
    
In [84]:
    
# Quick look at the ranges and counts of the minimal features.
df[['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']].describe()
    
    Out[84]:
In [85]:
    
# Baseline model: logistic regression (with built-in CV over C) on the
# minimal numeric feature set.
# Fix: pass random_state so the split -- and therefore every score and
# confusion matrix below -- is reproducible across kernel restarts.
X_train_minimal, X_test_minimal, y_train_minimal, y_test_minimal = train_test_split(
    X_min, y_min, random_state=42)
logreg_min = LogisticRegressionCV(n_jobs=-1)
logreg_min.fit(X_train_minimal, y_train_minimal)
y_pred = logreg_min.predict(X_test_minimal)
cfn_matrix_minimal = confusion_matrix(y_test_minimal, y_pred)
# Print numpy floats with 2 decimals (affects the normalized matrix output).
np.set_printoptions(precision=2)
    
In [86]:
    
# Confusion matrix for the minimal-feature model, raw counts.
plt.figure()
plot_confusion_matrix(cfn_matrix_minimal, ['Died', 'Surv'], title="Cfn Matrix, no normalization")
    
    
    
In [87]:
    
plt.figure()
# Bug fix: this call passes normalize=True, but the title said
# "no normalization" -- make the title match what is actually plotted.
plot_confusion_matrix(cfn_matrix_minimal, ['Died', 'Surv'], normalize=True, title="Cfn Matrix, normalized")
    
    
    
In [88]:
    
# Per-class precision / recall / F1 for the minimal-feature model.
print(classification_report(y_test_minimal, y_pred, target_names=['Died', 'Surv']))
    
    
In [89]:
    
# Accuracy on the held-out minimal-feature test split.
logreg_min.score(X_test_minimal,y_test_minimal)
    
    Out[89]:
In [90]:
    
# Accuracy on ALL of X_min -- this includes the training rows, so it is
# an optimistic estimate compared with the held-out score above.
logreg_min.score(X_min,y_min)
    
    Out[90]:
In [91]:
    
# Precision on the held-out split (positive class = Survived).
precision_score(y_test_minimal, y_pred)
    
    Out[91]:
In [92]:
    
# Recall on the held-out split (positive class = Survived).
recall_score(y_test_minimal, y_pred)
    
    Out[92]:
In [107]:
    
# Full feature set: df.corr().columns is used to select the numeric
# (including dummy) columns; pop removes Survived from X and returns it
# as the target.
# NOTE(review): on pandas >= 2.0 df.corr() raises on non-numeric columns
# unless numeric_only=True is passed -- confirm against the pandas
# version in use.
X = df[df.corr().columns]
y = X.pop('Survived')
    
In [108]:
    
# Sanity check of the assembled feature matrix.
X.describe()
    
    Out[108]:
In [111]:
    
# Logistic regression (CV over C) on the full engineered feature set.
# Fix: pass random_state so the split -- and all downstream scores and
# confusion matrices -- is reproducible across kernel restarts.
X_train_features, X_test_features, y_train_features, y_test_features = train_test_split(
    X, y, random_state=42)
logreg = LogisticRegressionCV(n_jobs=-1)
logreg.fit(X_train_features, y_train_features)
y_pred = logreg.predict(X_test_features)
cfn_matrix_features = confusion_matrix(y_test_features, y_pred)
# Print numpy floats with 2 decimals (affects the normalized matrix output).
np.set_printoptions(precision=2)
    
In [112]:
    
# Confusion matrix for the full-feature model, raw counts.
plt.figure()
plot_confusion_matrix(cfn_matrix_features, ['Died', 'Surv'], title="Cfn Matrix, no normalization")
    
    
    
In [99]:
    
plt.figure()
# Bug fix: this call passes normalize=True, but the title said
# "no normalization" -- make the title match what is actually plotted.
plot_confusion_matrix(cfn_matrix_features, ['Died', 'Surv'], normalize=True, title="Cfn Matrix, normalized")
    
    
    
In [100]:
    
# Per-class precision / recall / F1 for the full-feature model.
print(classification_report(y_test_features, y_pred, target_names=['Died', 'Surv']))
    
    
In [101]:
    
# Accuracy on ALL rows -- includes the training split, so this is an
# optimistic estimate.
logreg.score(X, y)
    
    Out[101]:
In [102]:
    
# Precision on the held-out split (positive class = Survived).
precision_score(y_test_features, y_pred)
    
    Out[102]:
In [103]:
    
# Recall on the held-out split (positive class = Survived).
recall_score(y_test_features, y_pred)
    
    Out[103]:
In [104]:
    
# Inverse regularization strength chosen by LogisticRegressionCV.
logreg.C_
    
    Out[104]:
In [105]:
    
# The grid of C values that was searched during CV.
logreg.Cs_
    
    Out[105]:
In [106]:
    
# Baseline: fraction of survivors in the test split (a predict-all-died
# classifier would score 1 minus this).
sum(y_test_features)/len(y_test_features)
    
    Out[106]:
In [114]:
    
#X_train_features, X_test_features, y_train_features, y_test_features
    
In [115]:
    
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
    
In [116]:
    
# Decision tree baseline on the full feature set.
# NOTE(review): no random_state is set, so tie-breaking between equally
# good splits can vary between runs -- confirm reproducibility matters.
dtc = DecisionTreeClassifier()
dtc.fit(X_train_features, y_train_features)
    
    Out[116]:
In [120]:
    
# Predictions on the held-out split.
dtc_pred = dtc.predict(X_test_features)
    
In [122]:
    
# NOTE(review): this line is an exact duplicate of the first line of the
# next cell; one of the two is redundant.
cfn_matrix_features = confusion_matrix(y_test_features, dtc_pred)
    
In [121]:
    
# Confusion matrix for the decision tree (raw counts).  This rebinds
# cfn_matrix_features, which previously held the logistic-regression matrix.
cfn_matrix_features = confusion_matrix(y_test_features, dtc_pred)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cfn_matrix_features, ['Died', 'Surv'], title="Cfn Matrix, no normalization")
    
    
    
In [145]:
    
from math import sqrt
    
In [155]:
    
# Random forest with hand-tuned hyper-parameters.
# NOTE(review): max_features=14 and max_depth=5 are magic numbers with no
# recorded search behind them, and no random_state is set, so results
# vary between runs -- confirm / document how these were chosen.
rfc = RandomForestClassifier(n_estimators=1000, max_features=14, max_depth=5, n_jobs=-1)
rfc.fit(X_train_features, y_train_features)
    
    Out[155]:
In [156]:
    
# Predictions on the held-out split.
rfc_pred = rfc.predict(X_test_features)
    
In [157]:
    
# Confusion matrix for the random forest (raw counts); again rebinds
# cfn_matrix_features.
cfn_matrix_features = confusion_matrix(y_test_features, rfc_pred)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cfn_matrix_features, ['Died', 'Surv'], title="Cfn Matrix, no normalization")
    
    
    
In [197]:
    
# Feature engineering on the TEST set (same steps as for train.csv).
# Note: this rebinds `df`, replacing the training frame.
df = pd.read_csv("data/test.csv", index_col=0)
# Title is the token between the comma and the period in Name.
df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
df['FamSize'] = df['Parch'] + df['SibSp']
# Raw string so \w and \d are regex character classes, not invalid
# string escapes (a SyntaxWarning on modern Python).
decks_df = df['Cabin'].str.extract(r'(?P<Deck>\w)(?P<CabinNumber>\d+)', expand=True)
deck_dummies = pd.get_dummies(decks_df['Deck'], prefix='Deck')
title_dummies = pd.get_dummies(df['Title'])
gender_dummies = pd.get_dummies(df['Sex'])
class_dummies = pd.get_dummies(df['Pclass'], prefix='Class')
embarked_dummies = pd.get_dummies(df['Embarked'])
has_age = df['Age'].notnull().astype('int')
has_age.name = 'has_age'
# saving to variables and using .concat() once seems to be much faster
df = pd.concat([df, decks_df, deck_dummies, title_dummies, gender_dummies, class_dummies,
                embarked_dummies, has_age], axis=1)
    
In [199]:
    
# Impute missing ages for the TEST set, same recipe as for train.csv:
# ridge regression on the numeric columns of the rows that have an Age.
X_train_age = df[df['Age'].notnull()][df.corr().columns]
X_needs_age = df[df['Age'].isnull()][df.corr().columns]
y_train_age = X_train_age.pop('Age')
X_needs_age = X_needs_age.drop('Age', axis=1)
ridge_age = RidgeCV()
# Bug fix: assign the filled column back instead of calling
# fillna(inplace=True) on a chained column slice, which may operate on a
# temporary copy and is a no-op under pandas copy-on-write.  test.csv
# has a missing Fare, hence the fill; also fill X_needs_age so predict()
# cannot fail on a NaN fare.
fare_mean = X_train_age['Fare'].mean()
X_train_age['Fare'] = X_train_age['Fare'].fillna(fare_mean)
X_needs_age['Fare'] = X_needs_age['Fare'].fillna(fare_mean)
ridge_age.fit(X_train_age, y_train_age)
age_pred = np.array(ridge_age.predict(X_needs_age))
# Ridge can extrapolate below zero; clip predicted ages to a valid range.
age_pred[age_pred < 0] = 0
df.loc[df['Age'].isnull(), 'Age'] = age_pred
    
In [204]:
    
# Sanity check on the rows used to fit the test-set age imputer.
X_train_age.describe()
    
    Out[204]:
In [205]:
    
# The mean fare used to fill the missing Fare value in test.csv.
X_train_age['Fare'].mean()
    
    Out[205]:
In [206]:
    
# Summary of the fully prepared test frame (Age now imputed).
df.describe()
    
    Out[206]:
In [ ]: