Kaggle Titanic competition

https://www.kaggle.com/c/titanic



In [5]:

    
#Import pandas and matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import re

#Render matplotlibs inside the notebook
%matplotlib inline

#Change default matplotlib style
pd.set_option('display.mpl_style', 'default')
plt.rcParams['figure.figsize'] = (15, 5)



In [6]:

    
#Load train and test data
df = pd.read_csv("raw_data/train.csv")
df['dataset'] = 'train'
test = pd.read_csv("raw_data/test.csv")
test['dataset'] = 'test'
#Merge dataframes (so we apply the same transformations to both datasets)
df = df.append(test, ignore_index=True)



In [7]:

    
#Assign passenger id as index column
df.index = df['PassengerId']
#Drop passenger id column
df.drop(['PassengerId'], axis=1, inplace=True)



In [8]:

    
#Print columns information
df.info()









    



<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 1 to 1309
Data columns (total 12 columns):
Age         1046 non-null float64
Cabin       295 non-null object
Embarked    1307 non-null object
Fare        1308 non-null float64
Name        1309 non-null object
Parch       1309 non-null int64
Pclass      1309 non-null int64
Sex         1309 non-null object
SibSp       1309 non-null int64
Survived    891 non-null float64
Ticket      1309 non-null object
dataset     1309 non-null object
dtypes: float64(3), int64(3), object(6)
memory usage: 132.9+ KB



In [9]:

    
#Take a look at the data
df.head()









    Out[9]:






  
    
      
      Age
      Cabin
      Embarked
      Fare
      Name
      Parch
      Pclass
      Sex
      SibSp
      Survived
      Ticket
      dataset
    
    
      PassengerId
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      22
      NaN
      S
      7.2500
      Braund, Mr. Owen Harris
      0
      3
      male
      1
      0
      A/5 21171
      train
    
    
      2
      38
      C85
      C
      71.2833
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      0
      1
      female
      1
      1
      PC 17599
      train
    
    
      3
      26
      NaN
      S
      7.9250
      Heikkinen, Miss. Laina
      0
      3
      female
      0
      1
      STON/O2. 3101282
      train
    
    
      4
      35
      C123
      S
      53.1000
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      0
      1
      female
      1
      1
      113803
      train
    
    
      5
      35
      NaN
      S
      8.0500
      Allen, Mr. William Henry
      0
      3
      male
      0
      0
      373450
      train

Cabin

https://www.kaggle.com/c/titanic/forums/t/4693/is-cabin-an-important-predictor/25690



In [10]:

    
df.Cabin[df.Cabin.isnull()] = 'U'
df['deck'] =  df.Cabin.map(lambda x: x[0])
df.drop(['Cabin'], axis=1, inplace=True)









    



/Users/Edu/Envs/sklearn_model_eval/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':

Ticket



In [11]:

    
#ticket_prefix = df.Ticket.map(lambda x: x[:2])
#This isn't necessary
#df['ticket_prefix'] = ticket_prefix
#pd.crosstab(ticket_prefix, df.Survived)

Name



In [12]:

    
#Let's take a look at some of the names
df.Name.head(4)









    Out[12]:





PassengerId
1                              Braund, Mr. Owen Harris
2    Cumings, Mrs. John Bradley (Florence Briggs Th...
3                               Heikkinen, Miss. Laina
4         Futrelle, Mrs. Jacques Heath (Lily May Peel)
Name: Name, dtype: object

Each name has a title (like Mrs or Mr), maybe that helps to predict survival.



In [13]:

    
#For each name, extract the title
name_title = df.Name.map(lambda name: re.search('.*,{1}\s{1}([a-zA-Z\s]+)\.{1}.*', name).group(1))
df['name_title'] = name_title
#Create a table to compare it with survival
np_tab = pd.crosstab(name_title, df.Survived)
np_tab









    Out[13]:






  
    
      Survived
      0.0
      1.0
    
    
      Name
      
      
    
  
  
    
      Capt
      1
      0
    
    
      Col
      1
      1
    
    
      Don
      1
      0
    
    
      Dr
      4
      3
    
    
      Jonkheer
      1
      0
    
    
      Lady
      0
      1
    
    
      Major
      1
      1
    
    
      Master
      17
      23
    
    
      Miss
      55
      127
    
    
      Mlle
      0
      2
    
    
      Mme
      0
      1
    
    
      Mr
      436
      81
    
    
      Mrs
      26
      99
    
    
      Ms
      0
      1
    
    
      Rev
      6
      0
    
    
      Sir
      0
      1
    
    
      the Countess
      0
      1

Seems like being a Mr does not help to survive (that's a proxy for 2-class male), on the other side being a Miss or Mrs helps a lot. Let's compute some features using this new information.



In [14]:

    
set(name_title)









    Out[14]:





{'Capt',
 'Col',
 'Don',
 'Dona',
 'Dr',
 'Jonkheer',
 'Lady',
 'Major',
 'Master',
 'Miss',
 'Mlle',
 'Mme',
 'Mr',
 'Mrs',
 'Ms',
 'Rev',
 'Sir',
 'the Countess'}



In [15]:

    
'''is_man = name_title.isin(['Capt', 'Don', 'Rev', 'Mr', 'Dr', 'Col', 'Major', 'Master', 'Ms'])
is_woman = name_title.isin(['Miss', 'Mrs', 'Dona'])
is_sir = name_title.isin(['Sir'])
is_lady = name_title.isin(['Jonkheer', 'Mme', 'Lady', 'Mlle', 'the Countess'])

name_title[is_man] = 'man'
name_title[is_woman] = 'woman'
name_title[is_sir] = 'sir'
name_title[is_lady] = 'lady'
df['name_title'] = name_title'''









    Out[15]:





"is_man = name_title.isin(['Capt', 'Don', 'Rev', 'Mr', 'Dr', 'Col', 'Major', 'Master', 'Ms'])\nis_woman = name_title.isin(['Miss', 'Mrs', 'Dona'])\nis_sir = name_title.isin(['Sir'])\nis_lady = name_title.isin(['Jonkheer', 'Mme', 'Lady', 'Mlle', 'the Countess'])\n\nname_title[is_man] = 'man'\nname_title[is_woman] = 'woman'\nname_title[is_sir] = 'sir'\nname_title[is_lady] = 'lady'\ndf['name_title'] = name_title"



In [16]:

    
sums =  np_tab.apply(lambda row: row[0]+row[1], axis=1)
n_passengers = df.shape[0]
np_tab['percentage_not_survived'] = np_tab.loc[:,0]*100/sums
np_tab['percentage_survived'] = np_tab.iloc[:,1]*100/sums
#np_tab.drop(np_tab.columns[[0,1]], axis=1, inplace=True)
np_tab = np_tab.sort(['percentage_survived'])
np_tab









    



/Users/Edu/Envs/sklearn_model_eval/lib/python2.7/site-packages/ipykernel/__main__.py:6: FutureWarning: sort(columns=....) is deprecated, use sort_values(by=.....)






    Out[16]:






  
    
      Survived
      0.0
      1.0
      percentage_not_survived
      percentage_survived
    
    
      Name
      
      
      
      
    
  
  
    
      Capt
      1
      0
      100.000000
      0.000000
    
    
      Don
      1
      0
      100.000000
      0.000000
    
    
      Jonkheer
      1
      0
      100.000000
      0.000000
    
    
      Rev
      6
      0
      100.000000
      0.000000
    
    
      Mr
      436
      81
      84.332689
      15.667311
    
    
      Dr
      4
      3
      57.142857
      42.857143
    
    
      Col
      1
      1
      50.000000
      50.000000
    
    
      Major
      1
      1
      50.000000
      50.000000
    
    
      Master
      17
      23
      42.500000
      57.500000
    
    
      Miss
      55
      127
      30.219780
      69.780220
    
    
      Mrs
      26
      99
      20.800000
      79.200000
    
    
      Mme
      0
      1
      0.000000
      100.000000
    
    
      Sir
      0
      1
      0.000000
      100.000000
    
    
      Ms
      0
      1
      0.000000
      100.000000
    
    
      Lady
      0
      1
      0.000000
      100.000000
    
    
      Mlle
      0
      2
      0.000000
      100.000000
    
    
      the Countess
      0
      1
      0.000000
      100.000000

Fare



In [17]:

    
df[df.Fare.isnull()]









    Out[17]:






  
    
      
      Age
      Embarked
      Fare
      Name
      Parch
      Pclass
      Sex
      SibSp
      Survived
      Ticket
      dataset
      deck
      name_title
    
    
      PassengerId
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1044
      60.5
      S
      NaN
      Storey, Mr. Thomas
      0
      3
      male
      0
      NaN
      3701
      test
      U
      Mr



In [18]:

    
df.boxplot(column='Fare', by='Pclass')









    Out[18]:





<matplotlib.axes._subplots.AxesSubplot at 0x1078b8810>



In [19]:

    
df.loc[1044,'Fare'] = df[df.Pclass==3]['Fare'].median()

Age



In [20]:

    
df[df.Fare.isnull()]['Fare'] = 1



In [21]:

    
df.Age.describe()









    Out[21]:





count    1046.000000
mean       29.881138
std        14.413493
min         0.170000
25%        21.000000
50%        28.000000
75%        39.000000
max        80.000000
Name: Age, dtype: float64



In [22]:

    
no_age = df[df.Age.isnull()]
no_age.shape









    Out[22]:





(263, 13)



In [23]:

    
df.boxplot(column='Age', by='Pclass')









    Out[23]:





<matplotlib.axes._subplots.AxesSubplot at 0x107837410>



In [24]:

    
df.boxplot(column='Age', by='Sex')









    Out[24]:





<matplotlib.axes._subplots.AxesSubplot at 0x107e0b310>

Let's use Pclass to estimate the age, using the median por each Pclass



In [25]:

    
median_ages = df[['Pclass','Age','Sex']].groupby(['Pclass','Sex']).median()
median_ages



In [26]:

    
def estimate_age(row):
    if pd.isnull(row.Age):
        return float(median_ages.ix[row.Pclass].ix[row.Sex])
    return row.Age



In [27]:

    
df['EstimatedAge'] = df.apply(estimate_age, axis=1)
df.drop('Age', axis=1, inplace=True)



In [28]:

    
df.head()









    Out[28]:






  
    
      
      Embarked
      Fare
      Name
      Parch
      Pclass
      Sex
      SibSp
      Survived
      Ticket
      dataset
      deck
      name_title
      EstimatedAge
    
    
      PassengerId
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      S
      7.2500
      Braund, Mr. Owen Harris
      0
      3
      male
      1
      0
      A/5 21171
      train
      U
      Mr
      22
    
    
      2
      C
      71.2833
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      0
      1
      female
      1
      1
      PC 17599
      train
      C
      Mrs
      38
    
    
      3
      S
      7.9250
      Heikkinen, Miss. Laina
      0
      3
      female
      0
      1
      STON/O2. 3101282
      train
      U
      Miss
      26
    
    
      4
      S
      53.1000
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      0
      1
      female
      1
      1
      113803
      train
      C
      Mrs
      35
    
    
      5
      S
      8.0500
      Allen, Mr. William Henry
      0
      3
      male
      0
      0
      373450
      train
      U
      Mr
      35

Embarked



In [29]:

    
df[df.Embarked.isnull()]









    Out[29]:






  
    
      
      Embarked
      Fare
      Name
      Parch
      Pclass
      Sex
      SibSp
      Survived
      Ticket
      dataset
      deck
      name_title
      EstimatedAge
    
    
      PassengerId
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      62
      NaN
      80
      Icard, Miss. Amelie
      0
      1
      female
      0
      1
      113572
      train
      B
      Miss
      38
    
    
      830
      NaN
      80
      Stone, Mrs. George Nelson (Martha Evelyn)
      0
      1
      female
      0
      1
      113572
      train
      B
      Mrs
      62



In [30]:

    
pd.crosstab(df.Embarked, df.Pclass)



In [31]:

    
pd.crosstab(df.Embarked, df.Survived)



In [32]:

    
df.loc[61,'Embarked'] = 'S'
df.loc[829,'Embarked'] = 'S'

SibSp and Parch



In [33]:

    
df['FamSize']  = df.SibSp + df.Parch
df[['FamSize', 'Survived']][df.dataset=='train'].groupby('FamSize').count()

Sex



In [34]:

    
df[['Sex', 'Survived']][df.dataset=='train'].groupby('Sex').count()

Interaction features - Fare/Age



In [35]:

    
#df['fare_over_age'] = df['Fare']/df['EstimatedAge']

Generate clean datasets



In [36]:

    
#Encode sex as dummies
sex_dummies = pd.get_dummies(df['Sex'])
df = df.join(sex_dummies)
#Encode embarked as a categorical variable
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='embarked')
df = df.join(embarked_dummies)
#Encode name_title as dummies
name_title_dummies = pd.get_dummies(df['name_title'])
df = df.join(name_title_dummies)
#Encode deck as dummies
deck_dummies = pd.get_dummies(df['deck'], prefix='deck')
df = df.join(deck_dummies)



In [37]:

    
df.head()









    Out[37]:






  
    
      
      Embarked
      Fare
      Name
      Parch
      Pclass
      Sex
      SibSp
      Survived
      Ticket
      dataset
      ...
      the Countess
      deck_A
      deck_B
      deck_C
      deck_D
      deck_E
      deck_F
      deck_G
      deck_T
      deck_U
    
    
      PassengerId
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      S
      7.2500
      Braund, Mr. Owen Harris
      0
      3
      male
      1
      0
      A/5 21171
      train
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
    
    
      2
      C
      71.2833
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      0
      1
      female
      1
      1
      PC 17599
      train
      ...
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
    
    
      3
      S
      7.9250
      Heikkinen, Miss. Laina
      0
      3
      female
      0
      1
      STON/O2. 3101282
      train
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
    
    
      4
      S
      53.1000
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      0
      1
      female
      1
      1
      113803
      train
      ...
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
    
    
      5
      S
      8.0500
      Allen, Mr. William Henry
      0
      3
      male
      0
      0
      373450
      train
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
    
  

5 rows × 46 columns



In [38]:

    
#Drop unnecessary variables
df.drop('Embarked', axis=1, inplace=True)
df.drop('Name', axis=1, inplace=True)
df.drop('Ticket', axis=1, inplace=True)
df.drop('Sex', axis=1, inplace=True)
df.drop('name_title', axis=1, inplace=True)
df.drop('deck', axis=1, inplace=True)



In [39]:

    
df.head()









    Out[39]:






  
    
      
      Fare
      Parch
      Pclass
      SibSp
      Survived
      dataset
      EstimatedAge
      FamSize
      female
      male
      ...
      the Countess
      deck_A
      deck_B
      deck_C
      deck_D
      deck_E
      deck_F
      deck_G
      deck_T
      deck_U
    
    
      PassengerId
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
      
    
  
  
    
      1
      7.2500
      0
      3
      1
      0
      train
      22
      1
      0
      1
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
    
    
      2
      71.2833
      0
      1
      1
      1
      train
      38
      1
      1
      0
      ...
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
    
    
      3
      7.9250
      0
      3
      0
      1
      train
      26
      0
      1
      0
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
    
    
      4
      53.1000
      0
      1
      1
      1
      train
      35
      1
      1
      0
      ...
      0
      0
      0
      1
      0
      0
      0
      0
      0
      0
    
    
      5
      8.0500
      0
      3
      0
      0
      train
      35
      0
      0
      1
      ...
      0
      0
      0
      0
      0
      0
      0
      0
      0
      1
    
  

5 rows × 40 columns



In [40]:

    
train = df[df['dataset']=='train']
test = df[df['dataset']=='test']



In [41]:

    
train.drop('dataset', axis=1, inplace=True)
test.drop(['dataset','Survived'], axis=1, inplace=True)









    



/Users/Edu/Envs/sklearn_model_eval/lib/python2.7/site-packages/ipykernel/__main__.py:1: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
/Users/Edu/Envs/sklearn_model_eval/lib/python2.7/site-packages/ipykernel/__main__.py:2: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app



In [42]:

    
train.to_csv("train_clean.csv", index_label='PassengerId')
test.to_csv("test_clean.csv", index_label='PassengerId')



In [ ]:

	Age	Cabin	Embarked	Fare	Name	Parch	Pclass	Sex	SibSp	Survived	Ticket	dataset
PassengerId
1	22	NaN	S	7.2500	Braund, Mr. Owen Harris	0	3	male	1	0	A/5 21171	train
2	38	C85	C	71.2833	Cumings, Mrs. John Bradley (Florence Briggs Th...	0	1	female	1	1	PC 17599	train
3	26	NaN	S	7.9250	Heikkinen, Miss. Laina	0	3	female	0	1	STON/O2. 3101282	train
4	35	C123	S	53.1000	Futrelle, Mrs. Jacques Heath (Lily May Peel)	0	1	female	1	1	113803	train
5	35	NaN	S	8.0500	Allen, Mr. William Henry	0	3	male	0	0	373450	train

Survived	0.0	1.0
Name
Capt	1	0
Col	1	1
Don	1	0
Dr	4	3
Jonkheer	1	0
Lady	0	1
Major	1	1
Master	17	23
Miss	55	127
Mlle	0	2
Mme	0	1
Mr	436	81
Mrs	26	99
Ms	0	1
Rev	6	0
Sir	0	1
the Countess	0	1

Survived	0.0	1.0	percentage_not_survived	percentage_survived
Name
Capt	1	0	100.000000	0.000000
Don	1	0	100.000000	0.000000
Jonkheer	1	0	100.000000	0.000000
Rev	6	0	100.000000	0.000000
Mr	436	81	84.332689	15.667311
Dr	4	3	57.142857	42.857143
Col	1	1	50.000000	50.000000
Major	1	1	50.000000	50.000000
Master	17	23	42.500000	57.500000
Miss	55	127	30.219780	69.780220
Mrs	26	99	20.800000	79.200000
Mme	0	1	0.000000	100.000000
Sir	0	1	0.000000	100.000000
Ms	0	1	0.000000	100.000000
Lady	0	1	0.000000	100.000000
Mlle	0	2	0.000000	100.000000
the Countess	0	1	0.000000	100.000000

	Age	Embarked	Fare	Name	Parch	Pclass	Sex	SibSp	Survived	Ticket	dataset	deck	name_title
PassengerId
1044	60.5	S	NaN	Storey, Mr. Thomas	0	3	male	0	NaN	3701	test	U	Mr

		Age
Pclass	Sex
1	female	36.0
1	male	42.0
2	female	28.0
2	male	29.5
3	female	22.0
3	male	25.0

	Embarked	Fare	Name	Parch	Pclass	Sex	SibSp	Survived	Ticket	dataset	deck	name_title	EstimatedAge
PassengerId
62	NaN	80	Icard, Miss. Amelie	0	1	female	0	1	113572	train	B	Miss	38
830	NaN	80	Stone, Mrs. George Nelson (Martha Evelyn)	0	1	female	0	1	113572	train	B	Mrs	62