In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
In [43]:
import warnings
warnings.filterwarnings('ignore')
In [44]:
# Switch matplotlib over to seaborn's default styling for the figures below.
sns.set()
In [45]:
train_df = pd.read_csv('.\\data\\titanic_train.csv', index_col='PassengerId')
y = train_df.Survived
train_df = train_df.drop(columns=['Survived'])
In [46]:
test_df = pd.read_csv('.\\data\\titanic_test.csv', index_col='PassengerId')
In [47]:
df = pd.concat([train_df, test_df])
In [48]:
df.head(n=2)
Out[48]:
In [49]:
df.info()
In [50]:
# --- age ---
# Fill missing ages with the median age of the passenger's class.
# groupby/transform covers every Pclass value present instead of repeating the
# same statement for the hard-coded classes 1, 2 and 3.
df['Age'] = df['Age'].fillna(df.groupby('Pclass')['Age'].transform('median'))
df['L_Age'] = np.log(df['Age'].values)
df['C_Age'] = pd.cut(df['Age'], 5)  # five equal-width age bands
df['IsYoung'] = (df['Age'] < 16).astype(int)  # 1 when strictly under 16

# --- fare ---
# A fare of 0 is treated as missing, then imputed with the per-class median
# (computed after the zeros have been blanked out).
df.loc[df['Fare'] == 0, 'Fare'] = np.nan
df['Fare'] = df['Fare'].fillna(df.groupby('Pclass')['Fare'].transform('median'))
df['L_Fare'] = np.log(df['Fare'].values)
df['C_Fare'] = pd.qcut(df['Fare'], 4)  # quartile bands

# --- title ---
# The title is the text between the first ',' and the following '.' in Name.
df['Name'] = df['Name'].astype(str)
df['Title'] = (
    df['Name']
    .str.split(',').map(lambda parts: parts[1].strip())
    .str.split('.').map(lambda parts: parts[0].strip())
)
# Collapse rare titles into a handful of groups; titles not listed here are
# left unchanged.
title_groups = {
    # unmarried women
    'Mlle': 'Miss', 'Ms': 'Miss',
    # madame
    'Mme': 'Mrs',
    # masters
    'Jonkheer': 'Master',
    # other men
    **dict.fromkeys(['Capt', 'Don', 'Major', 'Col', 'Sir', 'Dr', 'Rev'], 'Other_Men'),
    # other women
    **dict.fromkeys(['Lady', 'the Countess', 'Dona'], 'Other_Women'),
}
df['Title'] = df['Title'].replace(title_groups)
# encoding, just for fun
df['C_Title'] = df['Title'].map({ 'Mr': 1, 'Other_Men': 2, 'Master': 3, 'Miss': 4, 'Mrs': 5, 'Other_Women': 6 })

# --- embarked ---
df['Embarked'] = df['Embarked'].fillna('S')

# --- family size (siblings/spouses + parents/children + the passenger) ---
df['Size'] = df['SibSp'] + df['Parch'] + 1
df['IsAlone'] = (df['Size'] == 1).astype(int)

# --- dtypes ---
df['Title'] = df['Title'].astype('category')
df['Pclass'] = df['Pclass'].astype(int)
# Encode Sex as 1 for 'male' and 0 otherwise. The guard keeps the cell safe to
# re-run: comparing the already-encoded integers with 'male' would silently
# turn every row into 0.
if not pd.api.types.is_numeric_dtype(df['Sex']):
    df['Sex'] = (df['Sex'] == 'male').astype(int)
df['Embarked'] = df['Embarked'].astype('category')
In [51]:
# One-hot encode each listed column and append the indicator columns to df,
# in the order the (column, prefix) pairs are listed.
df = pd.concat(
    [df] + [
        pd.get_dummies(df[source], prefix=label)
        for source, label in [
            ('Embarked', 'Embarked'),
            ('Title', 'Title'),
            ('Pclass', 'Pclass'),
            ('Sex', 'Sex'),
            ('C_Age', 'CategoryAge'),
            ('C_Fare', 'CategoryFare'),
        ]
    ],
    axis=1,
)
In [52]:
df.info()
In [53]:
df.head(n=3)
Out[53]:
In [54]:
# The first len(y) rows of df are the labelled rows; the remainder is unlabelled.
X = df.iloc[:len(y)]
# XC keeps every column plus the target, for the exploratory plots.
XC = pd.concat([X, y], axis=1)
# Drop the raw / intermediate columns from both model matrices in one pass.
X, Xt = (
    part.drop(columns=['Name', 'Ticket', 'Cabin', 'Title', 'C_Age', 'C_Fare', 'Embarked', 'Age', 'Fare', 'Pclass'])
    for part in (X, df.iloc[len(y):])
)
In [55]:
# survival vs not ...
_ = sns.countplot(x='Survived', data=XC)
plt.show()
In [56]:
# survival by sex ...
_ = sns.countplot('Sex', hue='Survived', data=XC)
plt.show()
In [57]:
# survival by title ...
_ = sns.countplot('Pclass', hue='Survived', data=XC)
plt.show()
In [58]:
# survival by embarked ...
_ = sns.countplot('Embarked', hue='Survived', data=XC)
plt.show()
In [59]:
# embarked by title ...
_ = sns.countplot('Embarked', hue='Title', data=XC)
plt.show()
In [60]:
# survival by title ...
_ = sns.countplot('Title', hue='Survived', data=XC)
plt.show()
In [61]:
# survival by family size ...
_ = sns.countplot('Size', hue='Survived', data=XC)
plt.show()
In [62]:
# survival by family size ...
_ = sns.countplot('Size', hue='Title', data=XC)
plt.show()
In [63]:
# survival, if you have elders ...
_ = sns.countplot('Parch', hue='Survived', data=XC)
plt.show()
In [64]:
# how those with siblings survived ...
_ = sns.countplot('SibSp', hue='Survived', data=XC)
plt.show()
In [65]:
# 16 and under vs others survived ...
_ = sns.countplot('IsYoung', hue='Survived', data=XC)
plt.show()
In [66]:
# fare ...
_ = plt.subplot(2,1,1)
_ = sns.countplot('C_Fare', hue='Survived', data=XC)
_ = plt.subplot(2,1,2)
_ = sns.countplot('C_Fare', hue='Pclass', data=XC)
plt.tight_layout()
plt.show()
In [67]:
# age ...
_ = plt.subplot(2,1,1)
_ = sns.countplot('C_Age', hue='Survived', data=XC)
_ = plt.subplot(2,1,2)
_ = sns.countplot('C_Age', hue='Pclass', data=XC)
plt.tight_layout()
plt.show()
In [68]:
_ = plt.hist(XC.loc[XC['Survived'] == 0, 'Age'], alpha=.5, color='red', label='Died')
_ = plt.hist(XC.loc[XC['Survived'] == 1, 'Age'], alpha=.5, color='black', label='Survived')
_ = plt.legend()
plt.show()
In [69]:
# age
_ = plt.hist(XC[XC['Survived'] == 0].Age, alpha=.5, label='Died')
_ = plt.hist(XC[XC['Survived'] == 1].Age, alpha=.5, label='Survived')
_ = plt.legend()
plt.tight_layout()
plt.show()
In [70]:
# Min, quartiles and max of age. np.percentile takes q on a 0-100 scale, so the
# quartiles are 25/50/75 (0.25 would be the 0.25th percentile).
np.percentile(XC.Age.values, [0, 25, 50, 75, 100])
Out[70]:
In [71]:
# sklearn.cross_validation and sklearn.grid_search were removed from
# scikit-learn; KFold, train_test_split and GridSearchCV now live in
# sklearn.model_selection.
from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.decomposition import PCA
In [72]:
# run PCA ...
In [73]:
# Fit a full PCA on the training features and chart the variance carried by
# each component.
pca = PCA().fit(X)
features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_)
plt.xticks(features)
plt.ylabel('variance')
plt.xlabel('PCA feature')
plt.show()
# Show the feature names for reference.
X.columns[:]
In [88]:
columns_to_use = X.columns[:10]
In [89]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=11)
In [90]:
# Candidate values for C: one per decade from 1e-10 up to 1e9.
grid = {'C': 10.0 ** np.arange(-10, 10)}
# Score every candidate by area under the ROC curve with 5-fold CV.
estimator = GridSearchCV(
    LogisticRegression(max_iter=10000),
    param_grid=grid,
    scoring='roc_auc',
    cv=5,
)
estimator.fit(X_train[columns_to_use], y_train)
# Keep the refitted best model for the evaluation cells that follow.
reg = estimator.best_estimator_
print('best scores:', estimator.best_score_)
In [91]:
# Hard predictions and positive-class probabilities on the held-out rows.
pred = reg.predict(X_test[columns_to_use])
proba = reg.predict_proba(X_test[columns_to_use])[:, 1]
# Scatter predicted probability against the true label; the red line marks 0.5.
_ = plt.plot(proba, y_test, color='blue', alpha=0.45, linestyle='none', marker='.')
_ = plt.axvline(x=0.5, alpha=0.5, color='red')
_ = plt.ylabel('actual')
_ = plt.xlabel('proba')
plt.show()
In [92]:
from sklearn.metrics import roc_curve, auc
In [93]:
# ROC curve on the held-out rows, with the AUC shown in the legend and the
# diagonal drawn as a dashed reference line.
fpr, tpr, _ = roc_curve(y_test, proba)
_ = plt.plot(fpr, tpr, label=f'auc = {auc(fpr, tpr)!s}')
_ = plt.plot([0, 1], [0, 1], linestyle='--')
_ = plt.legend(loc='lower right')
plt.show()
In [94]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
In [95]:
print(confusion_matrix(pred, y_test))
In [84]:
print(classification_report(pred, y_test))
In [85]:
predictions = reg.predict(Xt[columns_to_use])
In [86]:
output = pd.DataFrame({
'PassengerId': Xt.index,
'Survived': predictions
}).set_index('PassengerId')
#output.to_csv('.\\preds.csv')
In [87]:
output.head(n=10)
Out[87]:
In [ ]: