In [74]:
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import RidgeCV, LogisticRegressionCV
from sklearn.metrics import confusion_matrix, classification_report, precision_score, recall_score
from sklearn.model_selection import train_test_split
%matplotlib inline
In [ ]:
def prep_features(csv_file="data/train.csv"):
    """Load a Titanic CSV and engineer the model features.

    Adds Title (honorific parsed from Name), FamSize (Parch + SibSp),
    Deck/CabinNumber extracted from Cabin, one-hot dummies for Deck,
    Title, Sex, Pclass and Embarked, plus a has_age indicator marking
    rows whose Age was present in the raw data.

    Parameters
    ----------
    csv_file : str
        Path to a CSV with the Kaggle Titanic schema; the first column
        (PassengerId) becomes the index.

    Returns
    -------
    pandas.DataFrame
        The loaded frame with all engineered columns concatenated on.
    """
    df = pd.read_csv(csv_file, index_col=0)
    # Honorific sits between the comma and the period: "Braund, Mr. Owen" -> "Mr"
    df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
    df['FamSize'] = df['Parch'] + df['SibSp']
    # Raw string avoids the invalid-escape warning for \w / \d
    decks_df = df['Cabin'].str.extract(r'(?P<Deck>\w)(?P<CabinNumber>\d+)', expand=True)
    deck_dummies = pd.get_dummies(decks_df['Deck'], prefix='Deck')
    title_dummies = pd.get_dummies(df['Title'])
    gender_dummies = pd.get_dummies(df['Sex'])
    class_dummies = pd.get_dummies(df['Pclass'], prefix='Class')
    embarked_dummies = pd.get_dummies(df['Embarked'])
    has_age = df['Age'].notnull().astype('int')
    has_age.name = 'has_age'
    # saving to variables and using .concat() once seems to be much faster
    df = pd.concat([df, decks_df, deck_dummies, title_dummies, gender_dummies, class_dummies,
                    embarked_dummies, has_age], axis=1)
    # BUG FIX: the original built the frame but never returned it (returned None)
    return df
In [195]:
# Build the training frame: parse the honorific out of Name, total family
# size, cabin deck/number, then one-hot encode the categoricals.
df = pd.read_csv("data/train.csv", index_col=0)
df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
df['FamSize'] = df['Parch'] + df['SibSp']

# Cabin strings like "C85" -> Deck "C", CabinNumber "85"
decks_df = df['Cabin'].str.extract('(?P<Deck>\w)(?P<CabinNumber>\d+)', expand=True)

# Indicator for rows whose Age was present in the raw file
has_age = df['Age'].notnull().astype('int')
has_age.name = 'has_age'

deck_dummies = pd.get_dummies(decks_df['Deck'], prefix='Deck')
title_dummies = pd.get_dummies(df['Title'])
gender_dummies = pd.get_dummies(df['Sex'])
class_dummies = pd.get_dummies(df['Pclass'], prefix='Class')
embarked_dummies = pd.get_dummies(df['Embarked'])

# One pd.concat is much faster than repeated joins
frames = [df, decks_df, deck_dummies, title_dummies, gender_dummies,
          class_dummies, embarked_dummies, has_age]
df = pd.concat(frames, axis=1)
In [196]:
# Impute missing ages by regressing Age on the numeric features
# (df.corr().columns silently keeps only the numeric columns).
feature_cols = df.corr().columns
X_train_age = df.loc[df['Age'].notnull(), feature_cols].copy()
X_needs_age = df.loc[df['Age'].isnull(), feature_cols].copy()
y_train_age = X_train_age.pop('Age')
X_needs_age = X_needs_age.drop('Age', axis=1)

# BUG FIX: the original used chained assignment
# (X_train_age['Fare'].fillna(..., inplace=True)) which triggers
# SettingWithCopyWarning and may not stick, and it never filled Fare in
# X_needs_age, so predict() would fail on a row missing its fare.
fare_mean = X_train_age['Fare'].mean()
X_train_age['Fare'] = X_train_age['Fare'].fillna(fare_mean)
X_needs_age['Fare'] = X_needs_age['Fare'].fillna(fare_mean)

ridge_age = RidgeCV()
ridge_age.fit(X_train_age, y_train_age)
age_pred = np.array(ridge_age.predict(X_needs_age))
age_pred[age_pred < 0] = 0  # ages can't be negative; clip the regression output
df.loc[df['Age'].isnull(), 'Age'] = age_pred
In [78]:
def plot_confusion_matrix(cm, classes, normalize=False, title='Confusion Matrix', cmap=plt.cm.Blues):
    """
    This function prints and plots the confusion matrix.
    Normalization can be applied by setting `normalize=True`.

    Parameters
    ----------
    cm : ndarray of shape (n_classes, n_classes)
        Confusion matrix as returned by sklearn.metrics.confusion_matrix.
    classes : list of str
        Tick labels, in the same order as the matrix rows/columns.
    normalize : bool
        If True, each row is divided by its sum (per-true-class rates).
    title : str
        Figure title.
    cmap : matplotlib colormap
        Colormap for the image.
    """
    # BUG FIX: normalize BEFORE drawing. The original called imshow first,
    # so with normalize=True the image/colorbar showed raw counts while the
    # cell labels showed normalized rates.
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')
    print(cm)
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=45)
    plt.yticks(tick_marks, classes)
    # Label every cell, choosing a color that contrasts with the background
    thresh = cm.max() / 2.
    fmt = '.2f' if normalize else 'd'
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")
    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
In [79]:
# Distribution of Age after imputation (NaN ages were filled above)
plt.hist(df['Age'])
Out[79]:
In [80]:
# Count of passengers whose Age was present in the raw data
has_age.sum()
Out[80]:
In [81]:
# Overlay the age distributions of casualties vs. survivors,
# restricted to passengers whose age was actually recorded.
died_ages = df.loc[(df['Survived'] == 0) & (df['has_age'] == 1), 'Age']
survived_ages = df.loc[(df['Survived'] == 1) & (df['has_age'] == 1), 'Age']
plt.hist(died_ages, color='b', label='Died', bins=20)
plt.hist(survived_ages, color='g', alpha=0.8, label='Survived', bins=20)
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Number of people by age group')
plt.legend()
plt.show()
In [82]:
# Among survivors with a recorded age, compare the age distributions
# of men and women.
survivor_with_age = (df['Survived'] == 1) & (df['has_age'] == 1)
male_survivor_ages = df.loc[survivor_with_age & (df['male'] == 1), 'Age']
female_survivor_ages = df.loc[survivor_with_age & (df['female'] == 1), 'Age']
plt.hist(female_survivor_ages, color='b', label='Female', bins=20)
plt.hist(male_survivor_ages, color='g', label='Male', bins=20, alpha=.8)
plt.xlabel('Age')
plt.ylabel('Count')
plt.title('Survivors by age and gender')
plt.legend()
plt.show()
In [83]:
# Minimal feature set: just the raw numeric columns, target popped off.
minimal_cols = ['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']
X_min = df[minimal_cols]
y_min = X_min.pop('Survived')
In [84]:
# Summary statistics for the minimal numeric feature set
df[['Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare']].describe()
Out[84]:
In [85]:
# Cross-validated logistic regression on the minimal features.
# NOTE(review): train_test_split has no random_state, so the split (and
# every metric below) changes on each run — consider seeding.
X_train_minimal, X_test_minimal, y_train_minimal, y_test_minimal = train_test_split(X_min, y_min)
logreg_min = LogisticRegressionCV(n_jobs=-1)
logreg_min.fit(X_train_minimal, y_train_minimal)
y_pred = logreg_min.predict(X_test_minimal)
cfn_matrix_minimal = confusion_matrix(y_test_minimal, y_pred)
np.set_printoptions(precision=2)
In [86]:
# Raw-count confusion matrix for the minimal model
plt.figure()
plot_confusion_matrix(cfn_matrix_minimal, ['Died', 'Surv'], title="Cfn Matrix, no normalization")
In [87]:
# Row-normalized confusion matrix for the minimal model.
# BUG FIX: the title claimed "no normalization" even though normalize=True.
plt.figure()
plot_confusion_matrix(cfn_matrix_minimal, ['Died', 'Surv'], normalize=True, title="Cfn Matrix, normalized")
In [88]:
# Per-class precision / recall / F1 on the held-out split
print(classification_report(y_test_minimal, y_pred, target_names=['Died', 'Surv']))
In [89]:
# Accuracy on the held-out split
logreg_min.score(X_test_minimal,y_test_minimal)
Out[89]:
In [90]:
# NOTE(review): accuracy over ALL rows, including ones the model trained
# on — an optimistic estimate, not comparable to the held-out score above.
logreg_min.score(X_min,y_min)
Out[90]:
In [91]:
# Precision for the positive (Survived) class on the held-out split
precision_score(y_test_minimal, y_pred)
Out[91]:
In [92]:
# Recall for the positive (Survived) class on the held-out split
recall_score(y_test_minimal, y_pred)
Out[92]:
In [107]:
# Full feature set: every numeric column (df.corr() keeps only numerics),
# with the target popped off.
feature_cols = df.corr().columns
X = df[feature_cols]
y = X.pop('Survived')
In [108]:
# Summary statistics for the full feature matrix
X.describe()
Out[108]:
In [111]:
# Cross-validated logistic regression on the full feature set.
# NOTE(review): no random_state on the split — results vary per run.
# Also note y_pred is reused from the minimal-model cells above and is
# overwritten here.
X_train_features, X_test_features, y_train_features, y_test_features = train_test_split(X, y)
logreg = LogisticRegressionCV(n_jobs=-1)
logreg.fit(X_train_features, y_train_features)
y_pred = logreg.predict(X_test_features)
cfn_matrix_features = confusion_matrix(y_test_features, y_pred)
np.set_printoptions(precision=2)
In [112]:
# Raw-count confusion matrix for the full-feature model
plt.figure()
plot_confusion_matrix(cfn_matrix_features, ['Died', 'Surv'], title="Cfn Matrix, no normalization")
In [99]:
# Row-normalized confusion matrix for the full-feature model.
# BUG FIX: the title claimed "no normalization" even though normalize=True.
plt.figure()
plot_confusion_matrix(cfn_matrix_features, ['Died', 'Surv'], normalize=True, title="Cfn Matrix, normalized")
In [100]:
# Per-class precision / recall / F1 for the full-feature model
print(classification_report(y_test_features, y_pred, target_names=['Died', 'Surv']))
In [101]:
# NOTE(review): accuracy over ALL rows (train + test mixed) — optimistic;
# compare against the held-out metrics instead.
logreg.score(X, y)
Out[101]:
In [102]:
# Precision for the Survived class on the held-out split
precision_score(y_test_features, y_pred)
Out[102]:
In [103]:
# Recall for the Survived class on the held-out split
recall_score(y_test_features, y_pred)
Out[103]:
In [104]:
# Inverse regularization strength selected by cross-validation
logreg.C_
Out[104]:
In [105]:
# The grid of C values the cross-validation searched over
logreg.Cs_
Out[105]:
In [106]:
# Fraction of survivors in the test split — the accuracy of always
# predicting "died" is one minus this, a baseline for the scores above
sum(y_test_features)/len(y_test_features)
Out[106]:
In [114]:
#X_train_features, X_test_features, y_train_features, y_test_features
In [115]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
In [116]:
# Single decision tree baseline.
# NOTE(review): no random_state, so tie-breaking (and results) vary per run.
dtc = DecisionTreeClassifier()
dtc.fit(X_train_features, y_train_features)
Out[116]:
In [120]:
# Tree predictions on the held-out split
dtc_pred = dtc.predict(X_test_features)
In [122]:
# NOTE(review): duplicate of the first line of the following cell — this
# cell can be deleted.
cfn_matrix_features = confusion_matrix(y_test_features, dtc_pred)
In [121]:
# Confusion matrix for the decision tree (overwrites the logistic
# regression's cfn_matrix_features from above)
cfn_matrix_features = confusion_matrix(y_test_features, dtc_pred)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cfn_matrix_features, ['Died', 'Surv'], title="Cfn Matrix, no normalization")
In [145]:
from math import sqrt
In [155]:
# Random forest: 1000 trees, depth capped at 5, 14 features per split.
# NOTE(review): the `from math import sqrt` import above suggests 14 was
# meant to approximate sqrt of the feature count — TODO confirm; also no
# random_state, so results vary per run.
rfc = RandomForestClassifier(n_estimators=1000, max_features=14, max_depth=5, n_jobs=-1)
rfc.fit(X_train_features, y_train_features)
Out[155]:
In [156]:
# Forest predictions on the held-out split
rfc_pred = rfc.predict(X_test_features)
In [157]:
# Confusion matrix for the random forest (again overwrites
# cfn_matrix_features)
cfn_matrix_features = confusion_matrix(y_test_features, rfc_pred)
np.set_printoptions(precision=2)
plt.figure()
plot_confusion_matrix(cfn_matrix_features, ['Died', 'Surv'], title="Cfn Matrix, no normalization")
In [197]:
# Rebuild the same engineered features for the Kaggle TEST set
# (replaces the module-level df with the test frame).
df = pd.read_csv("data/test.csv", index_col=0)
df['Title'] = df['Name'].apply(lambda x: x.split(',')[1].split('.')[0].strip())
df['FamSize'] = df['Parch'] + df['SibSp']

# Cabin strings like "C85" -> Deck "C", CabinNumber "85"
decks_df = df['Cabin'].str.extract('(?P<Deck>\w)(?P<CabinNumber>\d+)', expand=True)

# Indicator for rows whose Age was present in the raw file
has_age = df['Age'].notnull().astype('int')
has_age.name = 'has_age'

deck_dummies = pd.get_dummies(decks_df['Deck'], prefix='Deck')
title_dummies = pd.get_dummies(df['Title'])
gender_dummies = pd.get_dummies(df['Sex'])
class_dummies = pd.get_dummies(df['Pclass'], prefix='Class')
embarked_dummies = pd.get_dummies(df['Embarked'])

# One pd.concat is much faster than repeated joins
frames = [df, decks_df, deck_dummies, title_dummies, gender_dummies,
          class_dummies, embarked_dummies, has_age]
df = pd.concat(frames, axis=1)
In [199]:
# Impute missing ages on the test set with the same ridge approach.
feature_cols = df.corr().columns
X_train_age = df.loc[df['Age'].notnull(), feature_cols].copy()
X_needs_age = df.loc[df['Age'].isnull(), feature_cols].copy()
y_train_age = X_train_age.pop('Age')
X_needs_age = X_needs_age.drop('Age', axis=1)

# BUG FIX: the original used chained assignment
# (X_train_age['Fare'].fillna(..., inplace=True)) which triggers
# SettingWithCopyWarning and may not stick, and it never filled Fare in
# X_needs_age — the test set has a missing fare, so predict() could fail.
fare_mean = X_train_age['Fare'].mean()
X_train_age['Fare'] = X_train_age['Fare'].fillna(fare_mean)
X_needs_age['Fare'] = X_needs_age['Fare'].fillna(fare_mean)

ridge_age = RidgeCV()
ridge_age.fit(X_train_age, y_train_age)
age_pred = np.array(ridge_age.predict(X_needs_age))
age_pred[age_pred < 0] = 0  # clip negative regression outputs
df.loc[df['Age'].isnull(), 'Age'] = age_pred
In [204]:
# Sanity check the features the age model was fit on
X_train_age.describe()
Out[204]:
In [205]:
# The mean fare used to fill missing Fare values above
X_train_age['Fare'].mean()
Out[205]:
In [206]:
# Final look at the fully-imputed test-set frame
df.describe()
Out[206]:
In [ ]: