In [5]:
#Import pandas and matplotlib
import pandas as pd
import matplotlib.pyplot as plt
import re
#Render matplotlibs inside the notebook
%matplotlib inline
#Change default matplotlib style.
#The pandas option 'display.mpl_style' was deprecated and then removed, so
#setting it raises an OptionError on current pandas; matplotlib's own style
#mechanism is the supported way to get the ggplot-like look.
plt.style.use('ggplot')
#Default figure size (width, height) for every plot in the notebook
plt.rcParams['figure.figsize'] = (15, 5)
In [6]:
#Load train and test data, tagging every row with the dataset it came from
df = pd.read_csv("raw_data/train.csv")
df['dataset'] = 'train'
test = pd.read_csv("raw_data/test.csv")
test['dataset'] = 'test'
#Merge dataframes (so we apply the same transformations to both datasets).
#DataFrame.append was removed in pandas 2.0; pd.concat is the equivalent call.
#Columns present in only one of the frames are filled with NaN in the other.
df = pd.concat([df, test], ignore_index=True)
In [7]:
#Use PassengerId as the row index; set_index moves the column into the
#index, so it no longer appears among the regular columns
df = df.set_index('PassengerId')
In [8]:
#Print columns information (dtype and non-null count of each column)
df.info()
In [9]:
#Take a look at the first rows of the merged data
df.head()
Out[9]:
In [10]:
#Passengers without a recorded cabin get the placeholder 'U'.
#fillna on the column replaces the chained assignment df.Cabin[mask] = 'U',
#which may write to a temporary copy and emits SettingWithCopyWarning.
df['Cabin'] = df['Cabin'].fillna('U')
#The deck feature is the first character of the cabin string
df['deck'] = df['Cabin'].str[0]
#The raw cabin string is not used after this point
df.drop(['Cabin'], axis=1, inplace=True)
In [11]:
#ticket_prefix = df.Ticket.map(lambda x: x[:2])
#This isn't necessary
#df['ticket_prefix'] = ticket_prefix
#pd.crosstab(ticket_prefix, df.Survived)
In [12]:
#Let's take a look at the first four names
df.Name.head(4)
Out[12]:
Each name has a title (like Mrs or Mr); maybe that helps to predict survival.
In [13]:
#For each name, extract the title: the run of letters/whitespace captured
#after a comma + whitespace and before a period.
#Raw string so that \s and \. reach the regex engine untouched (the same
#escapes in a normal string literal trigger warnings on recent Python).
title_pattern = r'.*,{1}\s{1}([a-zA-Z\s]+)\.{1}.*'
#str.extract gives NaN for a name that does not match, instead of failing
#with AttributeError on the None that re.search would return
name_title = df['Name'].str.extract(title_pattern, expand=False)
df['name_title'] = name_title
#Create a table to compare it with survival
np_tab = pd.crosstab(name_title, df.Survived)
np_tab
Out[13]:
It seems that being a Mr does not help survival (that's a proxy for 2-class male); on the other hand, being a Miss or Mrs helps a lot. Let's compute some features using this new information.
In [14]:
#List the distinct titles that were extracted
set(name_title)
Out[14]:
In [15]:
#Disabled experiment: the code below sits inside a bare string literal, so it
#is never executed (the cell just echoes the string). It would collapse the
#raw titles into four coarse groups: 'man', 'woman', 'sir' and 'lady'.
#NOTE(review): 'Ms' is listed among the male titles - looks like a mistake;
#confirm before re-enabling.
'''is_man = name_title.isin(['Capt', 'Don', 'Rev', 'Mr', 'Dr', 'Col', 'Major', 'Master', 'Ms'])
is_woman = name_title.isin(['Miss', 'Mrs', 'Dona'])
is_sir = name_title.isin(['Sir'])
is_lady = name_title.isin(['Jonkheer', 'Mme', 'Lady', 'Mlle', 'the Countess'])
name_title[is_man] = 'man'
name_title[is_woman] = 'woman'
name_title[is_sir] = 'sir'
name_title[is_lady] = 'lady'
df['name_title'] = name_title'''
Out[15]:
In [16]:
#Passengers with a known outcome per title: crosstab column 0 (not survived)
#plus column 1 (survived). Both columns are addressed by label so the cell
#gives the same result when re-run after the percentage columns exist.
sums = np_tab.loc[:, 0] + np_tab.loc[:, 1]
n_passengers = df.shape[0]
np_tab['percentage_not_survived'] = np_tab.loc[:, 0]*100/sums
np_tab['percentage_survived'] = np_tab.loc[:, 1]*100/sums
#np_tab.drop(np_tab.columns[[0,1]], axis=1, inplace=True)
#DataFrame.sort was removed from pandas; sort_values is its replacement
np_tab = np_tab.sort_values('percentage_survived')
np_tab
Out[16]:
In [17]:
#Show the rows whose Fare is missing
df.loc[df['Fare'].isnull()]
Out[17]:
In [18]:
#Box plot of Fare, one box per Pclass value
df.boxplot(column='Fare', by='Pclass')
Out[18]:
In [19]:
#Fill every missing fare with the median fare of that passenger's Pclass,
#instead of hard-coding one passenger label (1044) and one class (3).
missing_fare = df['Fare'].isnull()
class_median_fare = df.groupby('Pclass')['Fare'].transform('median')
df.loc[missing_fare, 'Fare'] = class_median_fare[missing_fare]
In [20]:
#Fallback: any fare that is still missing is set to 1.
#Assign through a single .loc call; the chained form df[mask]['Fare'] = 1
#writes to a temporary copy and leaves df unchanged.
df.loc[df['Fare'].isnull(), 'Fare'] = 1
In [21]:
#Summary statistics of the Age column
df.Age.describe()
Out[21]:
In [22]:
#Rows with no recorded age; the shape tells how many there are
no_age = df.loc[df['Age'].isnull()]
no_age.shape
Out[22]:
In [23]:
#Box plot of Age, one box per Pclass value
df.boxplot(column='Age', by='Pclass')
Out[23]:
In [24]:
#Box plot of Age, one box per Sex value
df.boxplot(column='Age', by='Sex')
Out[24]:
Let's use Pclass and Sex to estimate the missing ages, using the median age for each (Pclass, Sex) group.
In [25]:
#Median age within every (Pclass, Sex) group
median_ages = df.groupby(['Pclass', 'Sex'])[['Age']].median()
median_ages
Out[25]:
In [26]:
def estimate_age(row, medians=None):
    """Return the passenger's age, imputing it when it is missing.

    Parameters
    ----------
    row : a row of the passenger frame exposing ``Age``, ``Pclass`` and ``Sex``.
    medians : optional frame indexed by (Pclass, Sex) with an ``Age`` column.
        When omitted, the notebook-level ``median_ages`` table is used.

    Returns
    -------
    ``row.Age`` when it is present; otherwise the median age of the row's
    (Pclass, Sex) group, as a float.
    """
    if not pd.isnull(row.Age):
        return row.Age
    if medians is None:
        medians = median_ages
    # .ix was removed in pandas 1.0; .loc with a (Pclass, Sex) tuple is the
    # label-based lookup on the two-level index.
    return float(medians.loc[(row.Pclass, row.Sex), 'Age'])
In [27]:
#Impute ages row by row, then remove the raw Age column now that
#EstimatedAge carries the completed values
df['EstimatedAge'] = df.apply(estimate_age, axis=1)
del df['Age']
In [28]:
#Inspect the first rows after Age was replaced by EstimatedAge
df.head()
Out[28]:
In [29]:
#Show the rows whose Embarked value is missing
df.loc[df['Embarked'].isnull()]
Out[29]:
In [30]:
#Passenger counts for every Embarked / Pclass combination
pd.crosstab(df.Embarked, df.Pclass)
Out[30]:
In [31]:
#Passenger counts for every Embarked / Survived combination
pd.crosstab(df.Embarked, df.Survived)
Out[31]:
In [32]:
#Fill every missing Embarked value with 'S'.
#Rows are selected with a null mask rather than the literal labels 61 and
#829: df is indexed by PassengerId, so .loc[61] / .loc[829] address the
#passengers with those ids, which only matches the missing rows if the
#labels were copied correctly.
#NOTE(review): 61 and 829 look like 0-based row positions rather than
#PassengerId labels - confirm against the output of the null check.
df.loc[df['Embarked'].isnull(), 'Embarked'] = 'S'
In [33]:
#Family size is the sum of the SibSp and Parch columns
df['FamSize'] = df['SibSp'] + df['Parch']
#Number of training rows with a known outcome for each family size
df.loc[df['dataset'] == 'train', ['FamSize', 'Survived']].groupby('FamSize').count()
Out[33]:
In [34]:
#Number of training rows with a known outcome for each sex
df.loc[df['dataset'] == 'train', ['Sex', 'Survived']].groupby('Sex').count()
Out[34]:
In [35]:
#df['fare_over_age'] = df['Fare']/df['EstimatedAge']
In [36]:
#One-hot encode the categorical columns.
#Sex and name_title keep their raw category values as column names;
#Embarked and deck columns get an 'embarked_' / 'deck_' prefix.
sex_dummies = pd.get_dummies(df['Sex'])
embarked_dummies = pd.get_dummies(df['Embarked'], prefix='embarked')
name_title_dummies = pd.get_dummies(df['name_title'])
deck_dummies = pd.get_dummies(df['deck'], prefix='deck')
#Append the indicator columns in the same order: sex, embarked, title, deck
for dummies in (sex_dummies, embarked_dummies, name_title_dummies, deck_dummies):
    df = df.join(dummies)
In [37]:
#Inspect the first rows with the dummy columns added
df.head()
Out[37]:
In [38]:
#Drop unnecessary variables: the columns that were dummy-encoded above,
#plus Name and Ticket
df.drop(['Embarked', 'Name', 'Ticket', 'Sex', 'name_title', 'deck'], axis=1, inplace=True)
In [39]:
#Inspect the first rows after the raw columns were dropped
df.head()
Out[39]:
In [40]:
#Split the merged frame back into its two datasets.
#Explicit copies make train and test independent frames, so modifying them
#in place later does not operate on a slice of df (SettingWithCopyWarning).
train = df[df['dataset']=='train'].copy()
test = df[df['dataset']=='test'].copy()
In [41]:
#Remove the helper 'dataset' column; test additionally loses 'Survived'.
#drop returns new frames here: an in-place drop on a frame obtained by
#filtering df triggers SettingWithCopyWarning.
train = train.drop('dataset', axis=1)
test = test.drop(['dataset','Survived'], axis=1)
In [42]:
#Write both cleaned frames to CSV, labelling the index column PassengerId
for csv_path, frame in (("train_clean.csv", train), ("test_clean.csv", test)):
    frame.to_csv(csv_path, index_label='PassengerId')
In [ ]: