In [334]:
import matplotlib.pyplot as plt
import scipy.stats as st
import seaborn as sns
import pandas as pd
import numpy as np
%matplotlib inline
# Read both CSV files, using the PassengerId column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='PassengerId')
    for csv_path in ('train.csv', 'test.csv')
)
In [3]:
# Preview the first five rows of the training frame.
train.head(n=5)
Out[3]:
In [4]:
# Summarise the training frame's columns (dtypes and non-null counts).
train.info()
In [5]:
# Same summary for the test frame, to compare columns and missing values.
test.info()
In [362]:
# Survival rate by sex, drawn on an explicitly selected axes.
plt.figure(num=1, figsize=(6, 6))
sns.barplot(data=train, x='Sex', y='Survived', ax=plt.gca())
plt.show()
In [343]:
# Overview figure: survival rate against each categorical feature (bar plots,
# left and right columns) and the Age / log-Fare distributions split by
# outcome (middle column).
survived_mask = train['Survived'] == 1
died_mask = train['Survived'] == 0

s_ages = train.loc[survived_mask, 'Age'].dropna()
d_ages = train.loc[died_mask, 'Age'].dropna()
# log1p(x) == log(1 + x); the +1 keeps zero fares finite.
s_fares = np.log1p(train.loc[survived_mask, 'Fare']).dropna()
d_fares = np.log1p(train.loc[died_mask, 'Fare']).dropna()

plt.figure(2, figsize=(12, 8))

# Bar plots: mean of Survived per category value.
for position, feature in ((231, 'Pclass'), (234, 'Embarked'),
                          (233, 'SibSp'), (236, 'Parch')):
    plt.subplot(position)
    sns.barplot(x=feature, y='Survived', data=train)

# Overlaid distributions. Each curve is labelled and a legend is drawn so the
# two outcomes can be told apart; the fare panel is labelled as a log scale
# because that is what is actually plotted.
for position, died, lived, xlabel in ((232, d_ages, s_ages, 'Age'),
                                      (235, d_fares, s_fares, 'log(1 + Fare)')):
    plt.subplot(position)
    sns.distplot(died, color='C0', label='Died')
    sns.distplot(lived, color='C1', label='Survived', axlabel=xlabel)
    plt.legend()

plt.show()
Here, we'll explore the features of the dataset. Since Sex
and Pclass
are rather clear-cut and have been explored in many other kernels, we will not explore those for now. We'll explore the related features SibSp
and Parch
together, as a "family size" feature group. Due to the large amount of missing values for Age
, we will explore it last - after looking at the other features, we may come up with strategies for imputation.
Finally, we'll create several derived features if necessary.
In [351]:
# Look at a few raw names to see how they are formatted.
train['Name'].head(n=5)
Out[351]:
The names look very consistently formatted, in the form `(last), (title). (first) (middle)`. Since there are only a handful of distinct titles (versus the largely unique names), we'll extract this information:
In [352]:
# The title is the text between the ", " after the surname and the first
# period. The pattern is a raw string so that `\,` and `\s` are handed to the
# regex engine verbatim instead of being treated as (invalid) Python string
# escapes; the compiled pattern is unchanged.
train['Title'] = train['Name'].str.extract(r'\,\s(.*?)[.]', expand=False)
print(train['Title'].unique())
In [353]:
# Same title extraction for the test set. The pattern is a raw string so that
# `\,` and `\s` are handed to the regex engine verbatim instead of being
# treated as (invalid) Python string escapes; the compiled pattern is unchanged.
test['Title'] = test['Name'].str.extract(r'\,\s(.*?)[.]', expand=False)
print(test['Title'].unique())
To start with, let's get an idea of how many passengers are holding each title.
In [366]:
# Number of passengers holding each title, one panel per sex.
plt.figure(3, figsize=(14, 4))
for position, sex in ((121, 'female'), (122, 'male')):
    plt.subplot(position)
    # Pass the series as `x=` explicitly: the meaning of countplot's first
    # positional argument differs between seaborn versions.
    sns.countplot(x=train.loc[train['Sex'] == sex, 'Title'])
    plt.title(sex)
plt.show()
The low counts for most of the titles suggest grouping the more esoteric ones together. We'll do so as follows (there are no hard rules, so we'll use some judgment):
For 'Ms.', we'll look at the woman's age, and also check her party.
In [121]:
# Inspect every passenger whose extracted title is exactly 'Ms'.
train.loc[train['Title'].eq('Ms')]
Out[121]:
Since she is relatively young and traveling alone, we'll throw her in with the "Miss" group.
In [359]:
# Map each raw title onto a coarser group.
# Common titles (plus Dr and Rev) map to themselves.
title_map = {title: title for title in ('Mr', 'Mrs', 'Miss', 'Master', 'Dr', 'Rev')}
# Nobility, split by sex.
title_map.update(dict.fromkeys(('Don', 'Sir', 'Jonkheer'), 'mnoble'))
title_map.update(dict.fromkeys(('Lady', 'the Countess', 'Dona'), 'fnoble'))
# Military ranks.
title_map.update(dict.fromkeys(('Col', 'Capt', 'Major'), 'mil'))
# French forms and 'Ms' are folded into the corresponding common titles.
title_map.update({'Mme': 'Mrs', 'Mlle': 'Miss', 'Ms': 'Miss'})

# Titles missing from the mapping would come out as NaN in AdjTitle.
for frame in (train, test):
    frame['AdjTitle'] = frame['Title'].map(title_map)
Let's see how these titles did:
In [365]:
# Survival rate per grouped title: females in the left panel, males in the right.
plt.figure(4, figsize=(8, 4))
for position, sex in zip((121, 122), ('female', 'male')):
    plt.subplot(position)
    sns.barplot(x='AdjTitle', y='Survived', data=train[train['Sex'] == sex])
plt.show()
For women, it seems pretty clear-cut: The women with nobility titles survived (as did women on the whole). The men with titles (all except Rev.) seem to do better on average, but it's highly variable. Since the gender-based model where all women live and all men die attains over 76% accuracy, the hard part of our model seems to be picking out the few male survivors.
In [364]:
# FamSize = SibSp + Parch (the passenger is not counted).
for frame in (train, test):
    frame['FamSize'] = frame['SibSp'] + frame['Parch']

# Distribution of family size, one panel per sex.
plt.figure(5, figsize=(8, 4))
for position, sex in ((121, 'female'), (122, 'male')):
    plt.subplot(position)
    # Pass the series as `x=` explicitly: the meaning of countplot's first
    # positional argument differs between seaborn versions.
    sns.countplot(x=train.loc[train['Sex'] == sex, 'FamSize'])
    plt.title(sex)
plt.show()
In [367]:
# FamSize = SibSp + Parch (the passenger is not counted). Recomputed here so
# the cell does not depend on any other cell having created the column.
train['FamSize'] = train['SibSp'] + train['Parch']
test['FamSize'] = test['SibSp'] + test['Parch']

# Family-size counts in a 2x3 grid: rows are sex (female on top, male below),
# columns are passenger class 1-3.
plt.figure(6, figsize=(12, 8))
for column, pclass in enumerate((1, 2, 3), start=1):
    for row, sex in enumerate(('female', 'male')):
        plt.subplot(2, 3, 3 * row + column)
        in_group = (train['Sex'] == sex) & (train['Pclass'] == pclass)
        # Pass the series as `x=` explicitly: the meaning of countplot's first
        # positional argument differs between seaborn versions.
        sns.countplot(x=train.loc[in_group, 'FamSize'])
        plt.title('{}, class {}'.format(sex, pclass))
plt.show()
How did this impact survival?
In [368]:
# Survival rate per family size: females in the left panel, males in the right.
plt.figure(7, figsize=(8, 4))
for position, sex in zip((121, 122), ('female', 'male')):
    plt.subplot(position)
    sns.barplot(x='FamSize', y='Survived', data=train[train['Sex'] == sex])
plt.show()
In [369]:
# Survival rate per family size in a 2x3 grid: rows are sex (female on top,
# male below), columns are passenger class 1-3.
plt.figure(9, figsize=(12, 8))
for column, pclass in enumerate((1, 2, 3), start=1):
    for row, sex in enumerate(('female', 'male')):
        plt.subplot(2, 3, 3 * row + column)
        sns.barplot(
            x='FamSize',
            y='Survived',
            data=train[(train['Sex'] == sex) & (train['Pclass'] == pclass)],
        )
plt.show()
Let's ignore (for the moment) possible effects from passenger class. We can then draw the following conclusions:
In [283]:
# Keep every passenger whose ticket number occurs more than once, then index
# by (Ticket, PassengerId) and sort so holders of the same ticket are adjacent.
ticket_dupes = (
    train.loc[train['Ticket'].duplicated(keep=False)]
    .set_index('Ticket', append=True)
    .swaplevel(0, 1)
    .sort_index()
)
ticket_dupes
Out[283]:
We can check whether holders of duplicate tickets are likely to share cabins, fares, family size and embark location.
In [290]:
# For each shared ticket, count how many distinct values its holders have in
# each column; a count of 1 means the whole group agrees on that column.
dupe_counts = (
    ticket_dupes
    .reset_index()
    .groupby('Ticket')[['Fare', 'Cabin', 'Embarked', 'FamSize']]
    .nunique()
)
dupe_counts.describe()
Out[290]:
It seems like most of them did. Let's take a look at the fares:
In [304]:
# Shared tickets whose holders paid more than one distinct fare.
ticket_dupes.loc[dupe_counts.loc[dupe_counts['Fare'].gt(1)].index.values]
Out[304]:
Only one pair of fares differs (and not by much). For all we know, this could be an entry error, but let's ignore this for now. Let's look at embark locations:
In [306]:
# Shared tickets whose holders embarked at more than one distinct port.
ticket_dupes.loc[dupe_counts.loc[dupe_counts['Embarked'].gt(1)].index.values]
Out[306]:
Only two! Though these could be mistakes, it is plausible that they did board at different locations, since they do not appear related to each other. Let's look at the last two variables:
In [307]:
# Shared tickets whose holders have more than one distinct family size.
ticket_dupes.loc[dupe_counts.loc[dupe_counts['FamSize'].gt(1)].index.values]
Out[307]:
In [308]:
# Shared tickets whose holders are listed under more than one distinct cabin value.
ticket_dupes.loc[dupe_counts.loc[dupe_counts['Cabin'].gt(1)].index.values]
Out[308]:
We have many more duplicate values here; it's plausible different families could split tickets, or bring servants/maids. Also, for family size, it's worth remembering that non-married partners do not count toward SibSp
.
In [315]:
# Distribution of the raw fare, followed by its sample skewness as the cell output.
plt.figure(num=10)
sns.distplot(train['Fare'], ax=plt.gca())
plt.show()
st.skew(train['Fare'])
Out[315]:
Let's look at the outlier values that are above 200...
In [323]:
# Passengers whose fare is strictly above 200.
train.loc[train['Fare'].gt(200)]
Out[323]:
We see that almost all of the fares have shared cabins and shared tickets. Let's test the theory that 'Fare' refers to a group fare of all tickets with the same number, rather than fare per ticket:
In [371]:
# TicketSize = number of rows in the same frame that carry the same ticket
# number (train and test are counted independently of each other).
for frame in (train, test):
    frame['TicketSize'] = frame['Ticket'].map(frame['Ticket'].value_counts())
In [372]:
# Fare against ticket group size with a fitted regression line, one panel per
# passenger class (1, 2, 3 from left to right).
plt.figure(11, figsize=(12, 4))
for panel, pclass in enumerate((1, 2, 3), start=1):
    plt.subplot(1, 3, panel)
    sns.regplot(x='TicketSize', y='Fare', data=train[train['Pclass'] == pclass])
plt.show()
Let's assume that it is linear. We'll divide by the ticket size, and look at the skew for each class:
In [373]:
# Per-person fare: the recorded fare split evenly across everyone on the ticket.
train['AdjFare'] = train['Fare'] / train['TicketSize']

# Histogram of the adjusted fare, one facet per passenger class.
g = sns.FacetGrid(train, col='Pclass').map(plt.hist, 'AdjFare')
plt.show()

# Sample skewness of the adjusted fare within each class (cell output).
train.groupby('Pclass')['AdjFare'].apply(st.skew)
Out[373]:
This is still somewhat right skewed. If we want, we can later use a square root transform; however, for now, we will leave the Fare as is.
In [11]:
# Flag passengers that have any cabin recorded, then cross-tabulate the flag
# against the outcome.
train['CabinKnown'] = ~train['Cabin'].isnull()
pd.crosstab(index=train['CabinKnown'], columns=train['Survived'])
Out[11]:
In [39]:
# Survival rate for passengers with and without a recorded cabin.
plt.figure(num=2)
sns.barplot(data=train, x='CabinKnown', y='Survived', ax=plt.gca())
plt.show()
Let's also search for duplicate cabins, since that may indicate party size and help impute missing values.
In [14]:
# Passengers whose (non-missing) cabin value is shared with at least one other
# passenger, indexed by (Cabin, PassengerId) so cabin-mates are adjacent.
(
    train.loc[train['Cabin'].notnull() & train['Cabin'].duplicated(keep=False)]
    .set_index('Cabin', append=True)
    .swaplevel(0, 1)
    .sort_index()
)
Out[14]:
In [329]:
# Rows of the training set with no embarkation port recorded.
train.loc[train['Embarked'].isnull()]
Out[329]:
Let's go by the ticket number: since ticket numbers follow different patterns, it's likely that the ticket number format correlates with the embarkation location. We'll scan all tickets that begin with '113' and see if there is a pattern:
In [333]:
# Embarkation ports of all passengers whose ticket number starts with '113'
# (rows with a missing port are not counted by value_counts).
train['Embarked'][train['Ticket'].str.startswith('113')].value_counts()
Out[333]:
Seems like an overwhelming number of '113' tickets boarded at Southampton. 'S' it is!
In [212]:
# Known ages split by sex and outcome (rows with a missing Age are dropped).
fs_ages = train.loc[(train['Survived'] == 1) & (train['Sex'] == "female"), 'Age'].dropna()
fd_ages = train.loc[(train['Survived'] == 0) & (train['Sex'] == "female"), 'Age'].dropna()
ms_ages = train.loc[(train['Survived'] == 1) & (train['Sex'] == "male"), 'Age'].dropna()
md_ages = train.loc[(train['Survived'] == 0) & (train['Sex'] == "male"), 'Age'].dropna()

# One-year-wide bins covering ages 0 to 80.
age_bins = range(81)

plt.figure(10, figsize=(9, 9))
for position, lived, died, xlabel in ((211, fs_ages, fd_ages, 'Female Age'),
                                      (212, ms_ages, md_ages, 'Male Age')):
    plt.subplot(position)
    # Raw counts (no KDE). Each histogram is labelled and a legend is drawn so
    # the two overlaid outcomes can be told apart.
    sns.distplot(lived, bins=age_bins, kde=False, color='C1', label='Survived')
    sns.distplot(died, bins=age_bins, kde=False, color='C0', label='Died', axlabel=xlabel)
    plt.legend()
plt.show()
There's obviously a dichotomy in both graphs: We see that teenaged or older males had a very poor survival rate compared with younger males. It seems back in the day, teenage boys were not considered "children."
For females, age seems to matter much less. There is a cutoff with about 50/50 survival rate (very young children dependent on others?) somewhere around 11 to 15.
To find a good cutoff point for "child" versus "adult," we can zoom our data in around ages 10-15.
In [148]:
# Passengers strictly between 10 and 15 years old, ordered by sex and then age.
train.loc[
    train['Age'].gt(10) & train['Age'].lt(15),
    ['Age', 'Survived', 'Sex'],
].sort_values(['Sex', 'Age'])
Out[148]:
We can set the cutoff at 12 or below to be considered a "child," and 13 or above to be considered an "adult." This will capture the border cases of two 13 year old girls surviving, and a 12 year old boy surviving.
In [149]:
# Child = aged 12 or younger. NOTE(review): a missing Age compares as False,
# so passengers with unknown age are flagged as non-children here.
train['Child'] = train['Age'].le(12)
In [203]:
# Age summary for passengers whose grouped title is 'Master'.
train.loc[train['AdjTitle'].eq('Master'), 'Age'].describe()
Out[203]:
In [204]:
# Age summary for passengers whose grouped title is 'Mr'.
train.loc[train['AdjTitle'].eq('Mr'), 'Age'].describe()
Out[204]:
So any male with the title 'Master' is no more than 12! This puts him in the (luckier) basket of male children, increasing his survival odds. Similarly, most (but not all) males who are 'Mr' are above 12, making them less likely to be lucky.
In [198]:
# Age summary for passengers whose raw (ungrouped) title is 'Miss'.
train.loc[train['Title'].eq('Miss'), 'Age'].describe()
Out[198]: