In [42]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [43]:
import warnings
warnings.filterwarnings('ignore')

In [44]:
sns.set()

In [45]:
train_df = pd.read_csv('.\\data\\titanic_train.csv', index_col='PassengerId')

y = train_df.Survived
train_df = train_df.drop(columns=['Survived'])

In [46]:
test_df = pd.read_csv('.\\data\\titanic_test.csv', index_col='PassengerId')

In [47]:
df = pd.concat([train_df, test_df])

In [48]:
df.head(n=2)


Out[48]:
Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
PassengerId
1 3 Braund, Mr. Owen Harris male 22.0 1 0 A/5 21171 7.2500 NaN S
2 1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 0 PC 17599 71.2833 C85 C

In [49]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 10 columns):
Pclass      1309 non-null int64
Name        1309 non-null object
Sex         1309 non-null object
Age         1046 non-null float64
SibSp       1309 non-null int64
Parch       1309 non-null int64
Ticket      1309 non-null object
Fare        1308 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
dtypes: float64(2), int64(3), object(5)
memory usage: 112.5+ KB

In [50]:
# age ...
age_by_class = df.groupby('Pclass')['Age'].median()
df.loc[(df['Pclass'] == 1) & (df['Age'].isnull()), 'Age'] = age_by_class[1]
df.loc[(df['Pclass'] == 2) & (df['Age'].isnull()), 'Age'] = age_by_class[2]
df.loc[(df['Pclass'] == 3) & (df['Age'].isnull()), 'Age'] = age_by_class[3]

df['L_Age'] = np.log(df.Age.values)
df['C_Age'] = pd.cut(df.Age, 5)

df['IsYoung'] = df['Age'] < 16
df.IsYoung = df.IsYoung.astype(int)

# fare, set 0 to nan ...
df.loc[df.Fare == 0, 'Fare'] = np.nan

fare_by_class = df.groupby('Pclass')['Fare'].median()
df.loc[df['Pclass'] == 1, 'Fare'] = df[df['Pclass'] == 1].Fare.fillna(fare_by_class[1])
df.loc[df['Pclass'] == 2, 'Fare'] = df[df['Pclass'] == 2].Fare.fillna(fare_by_class[2])
df.loc[df['Pclass'] == 3, 'Fare'] = df[df['Pclass'] == 3].Fare.fillna(fare_by_class[3])

df['L_Fare'] = np.log(df.Fare.values)
df['C_Fare'] = pd.qcut(df.Fare, 4)

# title
df.Name = df.Name.astype(str)
df['Title'] = df.Name.str.split(',').map(lambda x: x[1].strip()).str.split('.').map(lambda x: x[0].strip())

# unmarried women
df['Title'] = df.Title.replace(['Mlle','Ms'], 'Miss')
# madame
df['Title'] = df.Title.replace(['Mme'], 'Mrs')
# masters
df['Title'] = df.Title.replace(['Jonkheer'], 'Master')
# other men
df['Title'] = df.Title.replace(['Capt', 'Don', 'Major', 'Col', 'Sir', 'Dr', 'Rev'], 'Other_Men')
# other women
df['Title'] = df.Title.replace(['Lady', 'the Countess', 'Dona'], 'Other_Women')
# encoding, just for fun
df['C_Title'] = df.Title.map({ 'Mr': 1, 'Other_Men': 2, 'Master': 3, 'Miss': 4, 'Mrs': 5, 'Other_Women': 6 })

# embarked
df.Embarked = df.Embarked.fillna('S')

# size
df['Size'] = df.SibSp + df.Parch + 1

# is alone
df['IsAlone'] = df.Size == 1
df.IsAlone = df.IsAlone.astype(int)

# categories
df.Title = df.Title.astype('category')
df.Pclass = df.Pclass.astype(int)
df.Sex = df.Sex == 'male'
df.Sex = df.Sex.astype(int)
df.Embarked = df.Embarked.astype('category')

In [51]:
df = pd.concat([df, pd.get_dummies(df.Embarked, prefix='Embarked')], axis=1)
df = pd.concat([df, pd.get_dummies(df.Title, prefix='Title')], axis=1)
df = pd.concat([df, pd.get_dummies(df.Pclass, prefix='Pclass')], axis=1)
df = pd.concat([df, pd.get_dummies(df.Sex, prefix='Sex')], axis=1)
df = pd.concat([df, pd.get_dummies(df.C_Age, prefix='CategoryAge')], axis=1)
df = pd.concat([df, pd.get_dummies(df.C_Fare, prefix='CategoryFare')], axis=1)

In [52]:
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 42 columns):
Pclass                            1309 non-null int32
Name                              1309 non-null object
Sex                               1309 non-null int32
Age                               1309 non-null float64
SibSp                             1309 non-null int64
Parch                             1309 non-null int64
Ticket                            1309 non-null object
Fare                              1309 non-null float64
Cabin                             295 non-null object
Embarked                          1309 non-null category
L_Age                             1309 non-null float64
C_Age                             1309 non-null category
IsYoung                           1309 non-null int32
L_Fare                            1309 non-null float64
C_Fare                            1309 non-null category
Title                             1309 non-null category
C_Title                           1309 non-null int64
Size                              1309 non-null int64
IsAlone                           1309 non-null int32
Embarked_C                        1309 non-null uint8
Embarked_Q                        1309 non-null uint8
Embarked_S                        1309 non-null uint8
Title_Master                      1309 non-null uint8
Title_Miss                        1309 non-null uint8
Title_Mr                          1309 non-null uint8
Title_Mrs                         1309 non-null uint8
Title_Other_Men                   1309 non-null uint8
Title_Other_Women                 1309 non-null uint8
Pclass_1                          1309 non-null uint8
Pclass_2                          1309 non-null uint8
Pclass_3                          1309 non-null uint8
Sex_0                             1309 non-null uint8
Sex_1                             1309 non-null uint8
CategoryAge_(0.0902, 16.136]      1309 non-null uint8
CategoryAge_(16.136, 32.102]      1309 non-null uint8
CategoryAge_(32.102, 48.068]      1309 non-null uint8
CategoryAge_(48.068, 64.034]      1309 non-null uint8
CategoryAge_(64.034, 80.0]        1309 non-null uint8
CategoryFare_(3.17, 7.925]        1309 non-null uint8
CategoryFare_(7.925, 14.5]        1309 non-null uint8
CategoryFare_(14.5, 31.388]       1309 non-null uint8
CategoryFare_(31.388, 512.329]    1309 non-null uint8
dtypes: category(4), float64(4), int32(4), int64(4), object(3), uint8(23)
memory usage: 178.1+ KB

In [53]:
df.head(n=3)


Out[53]:
Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked ... Sex_1 CategoryAge_(0.0902, 16.136] CategoryAge_(16.136, 32.102] CategoryAge_(32.102, 48.068] CategoryAge_(48.068, 64.034] CategoryAge_(64.034, 80.0] CategoryFare_(3.17, 7.925] CategoryFare_(7.925, 14.5] CategoryFare_(14.5, 31.388] CategoryFare_(31.388, 512.329]
PassengerId
1 3 Braund, Mr. Owen Harris 1 22.0 1 0 A/5 21171 7.2500 NaN S ... 1 0 1 0 0 0 1 0 0 0
2 1 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 38.0 1 0 PC 17599 71.2833 C85 C ... 0 0 0 1 0 0 0 0 0 1
3 3 Heikkinen, Miss. Laina 0 26.0 0 0 STON/O2. 3101282 7.9250 NaN S ... 0 0 1 0 0 0 1 0 0 0

3 rows × 42 columns


In [54]:
X = df.iloc[:len(y)]
Xt = df.iloc[len(y):]
XC = pd.concat([X,y], axis=1)

X = X.drop(columns=['Name', 'Ticket', 'Cabin', 'Title', 'C_Age', 'C_Fare', 'Embarked', 'Age', 'Fare', 'Pclass'])
Xt = Xt.drop(columns=['Name', 'Ticket', 'Cabin', 'Title', 'C_Age', 'C_Fare', 'Embarked', 'Age', 'Fare', 'Pclass'])

In [55]:
# survival vs not ...
_ = sns.countplot(x='Survived', data=XC)
plt.show()



In [56]:
# survival by sex ...
_ = sns.countplot('Sex', hue='Survived', data=XC)
plt.show()



In [57]:
# survival by title ...
_ = sns.countplot('Pclass', hue='Survived', data=XC)
plt.show()



In [58]:
# survival by embarked ...
_ = sns.countplot('Embarked', hue='Survived', data=XC)
plt.show()



In [59]:
# embarked by title ...
_ = sns.countplot('Embarked', hue='Title', data=XC)
plt.show()



In [60]:
# survival by title ...
_ = sns.countplot('Title', hue='Survived', data=XC)
plt.show()



In [61]:
# survival by family size ...
_ = sns.countplot('Size', hue='Survived', data=XC)
plt.show()



In [62]:
# survival by family size ...
_ = sns.countplot('Size', hue='Title', data=XC)
plt.show()



In [63]:
# survival, if you have elders ...
_ = sns.countplot('Parch', hue='Survived', data=XC)
plt.show()



In [64]:
# how those with siblings survived ...
_ = sns.countplot('SibSp', hue='Survived', data=XC)
plt.show()



In [65]:
# 16 and under vs others survived ...
_ = sns.countplot('IsYoung', hue='Survived', data=XC)
plt.show()



In [66]:
# fare ...
_ = plt.subplot(2,1,1)
_ = sns.countplot('C_Fare', hue='Survived', data=XC)

_ = plt.subplot(2,1,2)
_ = sns.countplot('C_Fare', hue='Pclass', data=XC)

plt.tight_layout()
plt.show()



In [67]:
# age ...
_ = plt.subplot(2,1,1)
_ = sns.countplot('C_Age', hue='Survived', data=XC)

_ = plt.subplot(2,1,2)
_ = sns.countplot('C_Age', hue='Pclass', data=XC)

plt.tight_layout()
plt.show()



In [68]:
_ = plt.hist(XC.loc[XC['Survived'] == 0, 'Age'], alpha=.5, color='red', label='Died')
_ = plt.hist(XC.loc[XC['Survived'] == 1, 'Age'], alpha=.5, color='black', label='Survived')
_ = plt.legend()

plt.show()



In [69]:
# age
_ = plt.hist(XC[XC['Survived'] == 0].Age, alpha=.5, label='Died')
_ = plt.hist(XC[XC['Survived'] == 1].Age, alpha=.5, label='Survived')
_ = plt.legend()

plt.tight_layout()
plt.show()



In [70]:
np.percentile(XC.Age.values, [0, .25, .5, .75, 100])


Out[70]:
array([  0.42 ,   0.75 ,   0.83 ,   0.974,  80.   ])

In [71]:
from sklearn.cross_validation import KFold, train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.grid_search import GridSearchCV
from sklearn.decomposition import PCA

In [72]:
# run PCA ...

In [73]:
pca = PCA()
pca.fit(X)

features = range(pca.n_components_)
plt.bar(features, pca.explained_variance_)
plt.xlabel('PCA feature')
plt.ylabel('variance')
plt.xticks(features)
plt.show()

X.columns[:]



In [88]:
columns_to_use = X.columns[:10]

In [89]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.3, random_state=11)

In [90]:
grid = {
    'C': np.power(10.0, np.arange(-10, 10))
}

# scoring: area under the ROC.
estimator = GridSearchCV(LogisticRegression(max_iter=10000), grid, cv=5, scoring='roc_auc')
estimator.fit(X_train[columns_to_use], y_train)

reg = estimator.best_estimator_
print('best scores:', estimator.best_score_)


best scores: 0.8526296762251818

In [91]:
proba = reg.predict_proba(X_test[columns_to_use])[:, 1]
pred = reg.predict(X_test[columns_to_use])

_ = plt.plot(proba, y_test, marker='.', linestyle='none', color='blue', alpha=.45)
_ = plt.axvline(.5, color='red', alpha=.5)
_ = plt.xlabel('proba')
_ = plt.ylabel('actual')

plt.show()



In [92]:
from sklearn.metrics import roc_curve, auc

In [93]:
fpr, tpr, _ = roc_curve(y_test, proba)

_ = plt.plot(fpr, tpr, label='auc = ' + str(auc(fpr,tpr)))
_ = plt.plot([0,1],[0,1], linestyle='--')
_ = plt.legend(loc='lower right')

plt.show()



In [94]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [95]:
print(confusion_matrix(pred, y_test))


[[161  23]
 [ 15  69]]

In [84]:
print(classification_report(pred, y_test))


             precision    recall  f1-score   support

          0       0.91      0.88      0.89       184
          1       0.75      0.82      0.78        84

avg / total       0.86      0.86      0.86       268


In [85]:
predictions = reg.predict(Xt[columns_to_use])

In [86]:
output = pd.DataFrame({
    'PassengerId': Xt.index,
    'Survived': predictions
}).set_index('PassengerId')

#output.to_csv('.\\preds.csv')

In [87]:
output.head(n=10)


Out[87]:
Survived
PassengerId
892 0
893 1
894 0
895 0
896 1
897 0
898 1
899 0
900 1
901 0

In [ ]: