Kaggle Titanic competition

https://www.kaggle.com/c/titanic


In [5]:
#Import pandas and matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import re

#Render matplotlibs inside the notebook
%matplotlib inline

#Change default matplotlib style
pd.set_option('display.mpl_style', 'default')
plt.rcParams['figure.figsize'] = (15, 5)

In [6]:
#Load train and test data
df = pd.read_csv("raw_data/train.csv")
df['dataset'] = 'train'
test = pd.read_csv("raw_data/test.csv")
test['dataset'] = 'test'
#Merge dataframes (so we apply the same transformations to both datasets)
df = df.append(test, ignore_index=True)

In [7]:
#Assign passenger id as index column
df.index = df['PassengerId']
#Drop passenger id column
df.drop(['PassengerId'], axis=1, inplace=True)

In [8]:
#Print columns information
df.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 12 columns):
Age         1046 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
Fare        1308 non-null float64
Name        1309 non-null object
Parch       1309 non-null int64
Pclass      1309 non-null int64
Sex         1309 non-null object
SibSp       1309 non-null int64
Survived    891 non-null float64
Ticket      1309 non-null object
dataset     1309 non-null object
dtypes: float64(3), int64(3), object(6)
memory usage: 132.9+ KB

In [9]:
#Take a look at the data
df.head()


Out[9]:
Age Cabin Embarked Fare Name Parch Pclass Sex SibSp Survived Ticket dataset
PassengerId
1 22 NaN S 7.2500 Braund, Mr. Owen Harris 0 3 male 1 0 A/5 21171 train
2 38 C85 C 71.2833 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 1 female 1 1 PC 17599 train
3 26 NaN S 7.9250 Heikkinen, Miss. Laina 0 3 female 0 1 STON/O2. 3101282 train
4 35 C123 S 53.1000 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 1 female 1 1 113803 train
5 35 NaN S 8.0500 Allen, Mr. William Henry 0 3 male 0 0 373450 train

Cabin


In [10]:
df.Cabin[df.Cabin.isnull()] = 'U'
df['deck'] =  df.Cabin.map(lambda x: x[0])
df.drop(['Cabin'], axis=1, inplace=True)


/Users/Edu/Envs/sklearn_model_eval/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

Ticket


In [11]:
#ticket_prefix = df.Ticket.map(lambda x: x[:2])
#This isn't necessary
#df['ticket_prefix'] = ticket_prefix
#pd.crosstab(ticket_prefix, df.Survived)

Name


In [12]:
#Let's take a look at some of the names
df.Name.head(4)


Out[12]:
PassengerId
1                              Braund, Mr. Owen Harris
2    Cumings, Mrs. John Bradley (Florence Briggs Th...
3                               Heikkinen, Miss. Laina
4         Futrelle, Mrs. Jacques Heath (Lily May Peel)
Name: Name, dtype: object

Each name has a title (like Mrs or Mr), maybe that helps to predict survival.


In [13]:
#For each name, extract the title
name_title = df.Name.map(lambda name: re.search('.*,{1}\s{1}([a-zA-Z\s]+)\.{1}.*', name).group(1))
df['name_title'] = name_title
#Create a table to compare it with survival
np_tab = pd.crosstab(name_title, df.Survived)
np_tab


Out[13]:
Survived 0.0 1.0
Name
Capt 1 0
Col 1 1
Don 1 0
Dr 4 3
Jonkheer 1 0
Lady 0 1
Major 1 1
Master 17 23
Miss 55 127
Mlle 0 2
Mme 0 1
Mr 436 81
Mrs 26 99
Ms 0 1
Rev 6 0
Sir 0 1
the Countess 0 1

Seems like being a Mr does not help to survive (that's a proxy for 2-class male), on the other side being a Miss or Mrs helps a lot. Let's compute some features using this new information.


In [14]:
set(name_title)


Out[14]:
{'Capt',
 'Col',
 'Don',
 'Dona',
 'Dr',
 'Jonkheer',
 'Lady',
 'Major',
 'Master',
 'Miss',
 'Mlle',
 'Mme',
 'Mr',
 'Mrs',
 'Ms',
 'Rev',
 'Sir',
 'the Countess'}

In [15]:
'''is_man = name_title.isin(['Capt', 'Don', 'Rev', 'Mr', 'Dr', 'Col', 'Major', 'Master', 'Ms'])
is_woman = name_title.isin(['Miss', 'Mrs', 'Dona'])
is_sir = name_title.isin(['Sir'])
is_lady = name_title.isin(['Jonkheer', 'Mme', 'Lady', 'Mlle', 'the Countess'])

name_title[is_man] = 'man'
name_title[is_woman] = 'woman'
name_title[is_sir] = 'sir'
name_title[is_lady] = 'lady'
df['name_title'] = name_title'''


Out[15]:
"is_man = name_title.isin(['Capt', 'Don', 'Rev', 'Mr', 'Dr', 'Col', 'Major', 'Master', 'Ms'])\nis_woman = name_title.isin(['Miss', 'Mrs', 'Dona'])\nis_sir = name_title.isin(['Sir'])\nis_lady = name_title.isin(['Jonkheer', 'Mme', 'Lady', 'Mlle', 'the Countess'])\n\nname_title[is_man] = 'man'\nname_title[is_woman] = 'woman'\nname_title[is_sir] = 'sir'\nname_title[is_lady] = 'lady'\ndf['name_title'] = name_title"

In [16]:
sums =  np_tab.apply(lambda row: row[0]+row[1], axis=1)
n_passengers = df.shape[0]
np_tab['percentage_not_survived'] = np_tab.loc[:,0]*100/sums
np_tab['percentage_survived'] = np_tab.iloc[:,1]*100/sums
#np_tab.drop(np_tab.columns[[0,1]], axis=1, inplace=True)
np_tab = np_tab.sort(['percentage_survived'])
np_tab


/Users/Edu/Envs/sklearn_model_eval/lib/python2.7/site-packages/ipykernel/__main__.py:6: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)
Out[16]:
Survived 0.0 1.0 percentage_not_survived percentage_survived
Name
Capt 1 0 100.000000 0.000000
Don 1 0 100.000000 0.000000
Jonkheer 1 0 100.000000 0.000000
Rev 6 0 100.000000 0.000000
Mr 436 81 84.332689 15.667311
Dr 4 3 57.142857 42.857143
Col 1 1 50.000000 50.000000
Major 1 1 50.000000 50.000000
Master 17 23 42.500000 57.500000
Miss 55 127 30.219780 69.780220
Mrs 26 99 20.800000 79.200000
Mme 0 1 0.000000 100.000000
Sir 0 1 0.000000 100.000000
Ms 0 1 0.000000 100.000000
Lady 0 1 0.000000 100.000000
Mlle 0 2 0.000000 100.000000
the Countess 0 1 0.000000 100.000000

Fare


In [17]:
df[df.Fare.isnull()]


Out[17]:
Age Embarked Fare Name Parch Pclass Sex SibSp Survived Ticket dataset deck name_title
PassengerId
1044 60.5 S NaN Storey, Mr. Thomas 0 3 male 0 NaN 3701 test U Mr

In [18]:
df.boxplot(column='Fare', by='Pclass')


Out[18]:
<matplotlib.axes._subplots.AxesSubplot at 0x1078b8810>

In [19]:
df.loc[1044,'Fare'] = df[df.Pclass==3]['Fare'].median()

Age


In [20]:
df[df.Fare.isnull()]['Fare'] = 1

In [21]:
df.Age.describe()


Out[21]:
count    1046.000000
mean       29.881138
std        14.413493
min         0.170000
25%        21.000000
50%        28.000000
75%        39.000000
max        80.000000
Name: Age, dtype: float64

In [22]:
no_age = df[df.Age.isnull()]
no_age.shape


Out[22]:
(263, 13)

In [23]:
df.boxplot(column='Age', by='Pclass')


Out[23]:
<matplotlib.axes._subplots.AxesSubplot at 0x107837410>

In [24]:
df.boxplot(column='Age', by='Sex')


Out[24]:
<matplotlib.axes._subplots.AxesSubplot at 0x107e0b310>

Let's use Pclass to estimate the age, using the median por each Pclass


In [25]:
median_ages = df[['Pclass','Age','Sex']].groupby(['Pclass','Sex']).median()
median_ages


Out[25]:
Age
Pclass Sex
1 female 36.0
male 42.0
2 female 28.0
male 29.5
3 female 22.0
male 25.0

In [26]:
def estimate_age(row):
    if pd.isnull(row.Age):
        return float(median_ages.ix[row.Pclass].ix[row.Sex])
    return row.Age

In [27]:
df['EstimatedAge'] = df.apply(estimate_age, axis=1)
df.drop('Age', axis=1, inplace=True)

In [28]:
df.head()


Out[28]:
Embarked Fare Name Parch Pclass Sex SibSp Survived Ticket dataset deck name_title EstimatedAge
PassengerId
1 S 7.2500 Braund, Mr. Owen Harris 0 3 male 1 0 A/5 21171 train U Mr 22
2 C 71.2833 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 1 female 1 1 PC 17599 train C Mrs 38
3 S 7.9250 Heikkinen, Miss. Laina 0 3 female 0 1 STON/O2. 3101282 train U Miss 26
4 S 53.1000 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 1 female 1 1 113803 train C Mrs 35
5 S 8.0500 Allen, Mr. William Henry 0 3 male 0 0 373450 train U Mr 35

Embarked


In [29]:
df[df.Embarked.isnull()]


Out[29]:
Embarked Fare Name Parch Pclass Sex SibSp Survived Ticket dataset deck name_title EstimatedAge
PassengerId
62 NaN 80 Icard, Miss. Amelie 0 1 female 0 1 113572 train B Miss 38
830 NaN 80 Stone, Mrs. George Nelson (Martha Evelyn) 0 1 female 0 1 113572 train B Mrs 62

In [30]:
pd.crosstab(df.Embarked, df.Pclass)


Out[30]:
Pclass 1 2 3
Embarked
C 141 28 101
Q 3 7 113
S 177 242 495

In [31]:
pd.crosstab(df.Embarked, df.Survived)


Out[31]:
Survived 0.0 1.0
Embarked
C 75 93
Q 47 30
S 427 217

In [32]:
df.loc[61,'Embarked'] = 'S'
df.loc[829,'Embarked'] = 'S'

SibSp and Parch


In [33]:
df['FamSize']  = df.SibSp + df.Parch
df[['FamSize', 'Survived']][df.dataset=='train'].groupby('FamSize').count()


Out[33]:
Survived
FamSize
0 537
1 161
2 102
3 29
4 15
5 22
6 12
7 6
10 7

Sex


In [34]:
df[['Sex', 'Survived']][df.dataset=='train'].groupby('Sex').count()


Out[34]:
Survived
Sex
female 314
male 577

Interaction features - Fare/Age


In [35]:
#df['fare_over_age'] = df['Fare']/df['EstimatedAge']

Generate clean datasets


In [36]:
#Encode sex as dummies
sex_dummies = pd.get_dummies(df['Sex'])
df = df.join(sex_dummies)
#Encode embarked as a categorical variable
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='embarked')
df = df.join(embarked_dummies)
#Encode name_title as dummies
name_title_dummies = pd.get_dummies(df['name_title'])
df = df.join(name_title_dummies)
#Encode deck as dummies
deck_dummies = pd.get_dummies(df['deck'], prefix='deck')
df = df.join(deck_dummies)

In [37]:
df.head()


Out[37]:
Embarked Fare Name Parch Pclass Sex SibSp Survived Ticket dataset ... the Countess deck_A deck_B deck_C deck_D deck_E deck_F deck_G deck_T deck_U
PassengerId
1 S 7.2500 Braund, Mr. Owen Harris 0 3 male 1 0 A/5 21171 train ... 0 0 0 0 0 0 0 0 0 1
2 C 71.2833 Cumings, Mrs. John Bradley (Florence Briggs Th... 0 1 female 1 1 PC 17599 train ... 0 0 0 1 0 0 0 0 0 0
3 S 7.9250 Heikkinen, Miss. Laina 0 3 female 0 1 STON/O2. 3101282 train ... 0 0 0 0 0 0 0 0 0 1
4 S 53.1000 Futrelle, Mrs. Jacques Heath (Lily May Peel) 0 1 female 1 1 113803 train ... 0 0 0 1 0 0 0 0 0 0
5 S 8.0500 Allen, Mr. William Henry 0 3 male 0 0 373450 train ... 0 0 0 0 0 0 0 0 0 1

5 rows × 46 columns


In [38]:
#Drop unnecessary variables
df.drop('Embarked', axis=1, inplace=True)
df.drop('Name', axis=1, inplace=True)
df.drop('Ticket', axis=1, inplace=True)
df.drop('Sex', axis=1, inplace=True)
df.drop('name_title', axis=1, inplace=True)
df.drop('deck', axis=1, inplace=True)

In [39]:
df.head()


Out[39]:
Fare Parch Pclass SibSp Survived dataset EstimatedAge FamSize female male ... the Countess deck_A deck_B deck_C deck_D deck_E deck_F deck_G deck_T deck_U
PassengerId
1 7.2500 0 3 1 0 train 22 1 0 1 ... 0 0 0 0 0 0 0 0 0 1
2 71.2833 0 1 1 1 train 38 1 1 0 ... 0 0 0 1 0 0 0 0 0 0
3 7.9250 0 3 0 1 train 26 0 1 0 ... 0 0 0 0 0 0 0 0 0 1
4 53.1000 0 1 1 1 train 35 1 1 0 ... 0 0 0 1 0 0 0 0 0 0
5 8.0500 0 3 0 0 train 35 0 0 1 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 40 columns


In [40]:
train = df[df['dataset']=='train']
test = df[df['dataset']=='test']

In [41]:
train.drop('dataset', axis=1, inplace=True)
test.drop(['dataset','Survived'], axis=1, inplace=True)


/Users/Edu/Envs/sklearn_model_eval/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/Users/Edu/Envs/sklearn_model_eval/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app

In [42]:
train.to_csv("train_clean.csv", index_label='PassengerId')
test.to_csv("test_clean.csv", index_label='PassengerId')

In [ ]: