Given a training set of samples describing passengers who survived or did not survive the Titanic disaster, can our model determine — for a test dataset that contains no survival information — whether each of those passengers survived?
On April 15, 1912, [...], the Titanic sank after colliding with an iceberg, killing 1502 out of 2224 passengers and crew. That translates to a survival rate of roughly 32%.
Although there was some element of luck involved in surviving the sinking, which groups of people were more likely to survive than others?
What type of Machine Learning problem are we facing?
Would I survive the Titanic disaster?
In [1]:
import pandas as pd
# Load the Kaggle-style Titanic splits: train carries the Survived label,
# test carries features only, and test-check holds the true test labels
# (used later to score models on held-out data).
train_df = pd.read_csv('data/train.csv')
test_df = pd.read_csv('data/test.csv')
test_check_df = pd.read_csv('data/test-check.csv')
print('Train columns: ', train_df.columns.values, '\n')
print('Test columns: ', test_df.columns.values, '\n')
print('Test check columns: ', test_check_df.columns.values, '\n')
In [2]:
print('train_df.shape: ', train_df.shape)
print('test_df.shape: ', test_df.shape)
print('test_check_df.shape: ', test_check_df.shape)
In [3]:
train_df.head()
Out[3]:
In [4]:
train_df.tail()
Out[4]:
In [5]:
train_df.info()
In [6]:
train_df.describe()
Out[6]:
In [7]:
train_df[['Sex', 'Name', 'Embarked', 'Ticket', 'Cabin']].describe()
Out[7]:
In [8]:
# Overall training-set survival rate: with 0/1 labels the mean of the column
# equals (count of survivors) / (total count).
train_df['Survived'].mean()
Out[8]:
In [9]:
train_df[['Sex', 'Survived']].groupby(['Sex'], as_index=False).mean().sort_values(by='Survived', ascending=False)
Out[9]:
In [10]:
train_df[['Pclass', 'Survived']].groupby(['Pclass'], as_index=False).mean().sort_values(by='Survived', ascending=False)
Out[10]:
In [11]:
sex_and_class = train_df[['Pclass', 'Sex', 'Survived']].groupby(['Pclass', 'Sex'], as_index=False).mean()
sex_and_class.sort_values(by='Survived', ascending=False)
Out[11]:
In [12]:
train_df[['Embarked', 'Survived']].groupby(['Embarked'], as_index=False).mean()
Out[12]:
In [13]:
sex_and_class = train_df[['Pclass', 'Sex', 'Embarked', 'Survived']].groupby(['Pclass', 'Sex', 'Embarked'], as_index=False).mean()
sex_and_class.sort_values(by='Survived', ascending=False)
Out[13]:
In [14]:
train_df[['SibSp', 'Survived']].groupby(['SibSp'], as_index=False).mean().sort_values(by='Survived', ascending=False)
Out[14]:
In [15]:
train_df[['Parch', 'Survived']].groupby(['Parch'], as_index=False).mean().sort_values(by='Survived', ascending=False)
Out[15]:
In [16]:
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.FacetGrid(train_df, col='Survived').map(plt.hist, 'Age', bins=20)
Out[16]:
In [17]:
# Age histograms faceted by survival (columns) and passenger class (rows).
# NOTE(review): FacetGrid's `size` parameter was renamed `height` in seaborn 0.9;
# update if running on a newer seaborn.
grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', size=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', bins=20)
grid.add_legend();
In [18]:
grid = sns.FacetGrid(train_df, col='Survived', row='Sex', size=2.2, aspect=1.6)
grid.map(plt.hist, 'Age', bins=20)
grid.add_legend();
In [19]:
train_df[['Fare', 'Survived']].groupby(['Survived'], as_index=False).mean().sort_values(by='Survived', ascending=False)
Out[19]:
In [20]:
sns.FacetGrid(train_df, col='Survived').map(plt.hist, 'Fare', bins=10)
Out[20]:
In [21]:
sns.FacetGrid(train_df, col='Pclass').map(plt.hist, 'Fare', bins=10)
Out[21]:
In [22]:
grid = sns.FacetGrid(train_df, col='Survived', row='Pclass', size=2.2, aspect=1.6)
grid.map(plt.hist, 'Fare', bins=10)
grid.add_legend();
In [23]:
train_df[['Fare', 'Sex']].groupby(['Sex'], as_index=False).mean().sort_values(by='Fare', ascending=False)
Out[23]:
In [24]:
grid = sns.FacetGrid(train_df, col='Survived', row='Sex', size=2.2, aspect=1.6)
grid.map(plt.hist, 'Fare', bins=10)
grid.add_legend();
In [25]:
ticket_df = train_df[['Ticket', 'PassengerId']].groupby(['Ticket']).count()
ticket_df[ticket_df['PassengerId'] > 1].describe()
Out[25]:
In [26]:
print(train_df[train_df['Cabin'].notnull()].Cabin.str.extract('^([A-Za-z])', expand=False).unique())
print(train_df[train_df['Cabin'].notnull()].Cabin.str.extract('^([A-Za-z])', expand=False).describe())
In [27]:
train_df = train_df.drop(['PassengerId'], axis=1)
test_df = test_df.drop(['PassengerId'], axis=1)
In [28]:
print('train_df: ', train_df.shape, train_df.columns.values)
print('test_df: ', test_df.shape, test_df.columns.values)
In [29]:
# Encode Sex as a numeric feature: female -> 0, male -> 1.
sex_encoding = {'female': 0, 'male': 1}
train_df['Sex'] = train_df['Sex'].map(sex_encoding).astype(int)
test_df['Sex'] = test_df['Sex'].map(sex_encoding).astype(int)
train_df.head()
Out[29]:
In [30]:
mean_age = train_df['Age'].mean()
mean_age
Out[30]:
In [31]:
# Impute missing ages with the TRAIN-set mean age; the same train-derived value
# is applied to test so no test-set statistics leak into preprocessing.
train_df['Age'] = train_df['Age'].fillna(mean_age)
test_df['Age'] = test_df['Age'].fillna(mean_age)
train_df.info()
In [32]:
train_df['Age'] = train_df['Age'].astype(int)
test_df['Age'] = test_df['Age'].astype(int)
train_df.tail()
Out[32]:
In [33]:
# For object columns, describe()'s 'top' is the most frequent value (the mode),
# used below to impute missing embarkation ports.
most_freq_port = train_df['Embarked'].describe().top
most_freq_port
Out[33]:
In [34]:
# Fill missing embarkation ports with the most frequent port from the train set;
# the same value is applied to test for consistency.
train_df['Embarked'] = train_df['Embarked'].fillna(most_freq_port)
test_df['Embarked'] = test_df['Embarked'].fillna(most_freq_port)
train_df.info()
In [35]:
test_df = pd.get_dummies(test_df, columns=['Embarked'])
train_df = pd.get_dummies(train_df, columns=['Embarked'])
train_df.head()
Out[35]:
In [36]:
# FamilySize = the passenger themselves plus siblings/spouses (SibSp)
# and parents/children (Parch) aboard.
for frame in (train_df, test_df):
    frame['FamilySize'] = frame['SibSp'] + frame['Parch'] + 1
train_df.tail()
Out[36]:
In [37]:
train_df[['FamilySize', 'Survived']].groupby(['FamilySize'], as_index=False).mean().sort_values(by='Survived', ascending=False)
Out[37]:
In [38]:
# Binary flag: 1 when the passenger has no family aboard (FamilySize == 1).
for frame in (train_df, test_df):
    frame['TravellingAlone'] = (frame['FamilySize'] == 1).astype(int)
train_df[['FamilySize', 'TravellingAlone']].head(5)
Out[38]:
In [39]:
train_df['Fare'].describe()
Out[39]:
In [40]:
# Bucket fares into ordinal weights using the train-set quartile boundaries
# (from train_df['Fare'].describe(): 25% = 7.9104, 50% = 14.4542, 75% = 31.0).
FARE_BINS = (7.9104, 14.4542, 31.0)

def fare_weight(fare_series):
    """Map a Fare series to ordinal weights 1/2/4/8 by quartile bucket."""
    weight = pd.Series(8, index=fare_series.index)
    weight[fare_series <= FARE_BINS[2]] = 4
    weight[fare_series <= FARE_BINS[1]] = 2
    weight[fare_series <= FARE_BINS[0]] = 1
    return weight

# Impute the missing test fare with the TRAIN mean fare, computed here instead
# of hard-coding 32.204208, and taken before truncating Fare to int so the
# value matches the true mean.
mean_fare = train_df['Fare'].mean()

train_df['FareWeight'] = fare_weight(train_df['Fare']).astype(int)
train_df['Fare'] = train_df['Fare'].astype(int)

test_df['Fare'] = test_df['Fare'].fillna(mean_fare)
test_df['FareWeight'] = fare_weight(test_df['Fare']).astype(int)
test_df['Fare'] = test_df['Fare'].astype(int)
test_df.head()
Out[40]:
In [41]:
train_df[['FareWeight', 'Survived']].groupby(['FareWeight'], as_index=False).mean().sort_values(by='Survived', ascending=False)
Out[41]:
In [42]:
# Count how many passengers share each ticket number (group size). Using
# groupby(...).transform('count') avoids the original placeholder column and
# per-group lambda; then drop the raw Ticket string, no longer needed.
train_df['TicketOccurrences'] = train_df.groupby('Ticket')['Ticket'].transform('count')
test_df['TicketOccurrences'] = test_df.groupby('Ticket')['Ticket'].transform('count')
train_df = train_df.drop(['Ticket'], axis=1)
test_df = test_df.drop(['Ticket'], axis=1)
train_df.head(4)
Out[42]:
In [43]:
train_df[['TicketOccurrences', 'Survived']].groupby(['TicketOccurrences'], as_index=False).mean().sort_values(by='Survived', ascending=False)
Out[43]:
In [44]:
# Derive the deck letter ("Section") from the leading character of the cabin
# string; passengers without a cabin record get 'Unknown'. The .str accessor is
# used directly instead of wrapping it in Series.transform(lambda ...).
train_df['Section'] = train_df['Cabin'].str.extract('^([A-Za-z])', expand=False).fillna('Unknown')
test_df['Section'] = test_df['Cabin'].str.extract('^([A-Za-z])', expand=False).fillna('Unknown')
# One-hot encode the section and drop the raw Cabin column.
train_df = pd.get_dummies(train_df, columns=['Section'], sparse=True)
test_df = pd.get_dummies(test_df, columns=['Section'], sparse=True)
train_df = train_df.drop(['Cabin'], axis=1)
test_df = test_df.drop(['Cabin'], axis=1)
train_df[['Section_A', 'Section_B', 'Section_C', 'Section_D', 'Section_E', 'Section_F', 'Section_G', 'Section_T', 'Section_Unknown']].head(5)
Out[44]:
In [45]:
# Guarantee that every Section_* dummy column produced on the train set also
# exists on the test set (rarely occupied decks, e.g. 'T', may be absent from
# test). A loop replaces the nine copy-pasted if-blocks.
section_columns = ['Section_A', 'Section_B', 'Section_C', 'Section_D', 'Section_E',
                   'Section_F', 'Section_G', 'Section_T', 'Section_Unknown']
for column in section_columns:
    if column not in test_df.columns:
        test_df[column] = 0
print(train_df.shape)
print(test_df.shape)
print(train_df.columns)
print(test_df.columns)
In [46]:
train_df[['Section_A', 'Section_B', 'Section_C', 'Section_D', 'Section_E', 'Section_F', 'Section_G', 'Section_T', 'Section_Unknown', 'Survived']].groupby(['Survived'], as_index=False).mean().sort_values(by='Survived', ascending=False)
Out[46]:
In [47]:
# Extract the honorific title ("Mr", "Mrs", ...) — the word immediately before
# a period in the passenger name. A raw string is used for the pattern: '\.'
# in a plain string is an invalid escape sequence (DeprecationWarning on
# Python 3.6+, SyntaxWarning later).
train_df['Title'] = train_df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
test_df['Title'] = test_df.Name.str.extract(r' ([A-Za-z]+)\.', expand=False)
train_df[['Title', 'Name']].head(5)
Out[47]:
In [48]:
train_df['Title'].unique()
Out[48]:
In [49]:
# synonyms
train_df['Title'] = train_df['Title'].replace('Ms', 'Miss')
train_df['Title'] = train_df['Title'].replace('Mme', 'Mrs')
train_df['Title'] = train_df['Title'].replace('Mlle', 'Miss')
test_df['Title'] = test_df['Title'].replace('Ms', 'Miss')
test_df['Title'] = test_df['Title'].replace('Mme', 'Mrs')
test_df['Title'] = test_df['Title'].replace('Mlle', 'Miss')
train_df['Title'].unique()
Out[49]:
In [50]:
train_df = train_df.drop('Name', axis=1)
test_df = test_df.drop('Name', axis=1)
print(train_df.shape)
print(test_df.shape)
print(train_df.columns)
print(test_df.columns)
In [51]:
pd.crosstab(train_df['Title'], train_df['Survived'])
Out[51]:
In [52]:
# Collapse rare titles into coarse buckets. The original listed 'Dona' twice in
# the 'Other' group (redundant duplicate, now removed); a loop replaces the
# copy-pasted train/test statements.
title_groups = {
    'Noble': ['Lady', 'Countess', 'Sir'],
    'Military': ['Capt', 'Col', 'Major'],
    'Other': ['Don', 'Dona', 'Dr', 'Rev', 'Jonkheer'],
}
for frame in (train_df, test_df):
    for group_title, raw_titles in title_groups.items():
        frame['Title'] = frame['Title'].replace(raw_titles, group_title)
pd.crosstab(train_df['Title'], train_df['Survived'])
Out[52]:
In [53]:
train_df[['Title', 'Survived']].groupby(['Title'], as_index=False).mean().sort_values(by='Survived', ascending=False)
Out[53]:
In [54]:
# Ordinal weights per title bucket, roughly tracking the survival rates seen in
# the groupby above (higher = more likely to survive); any unmapped title
# falls back to 0 via fillna.
title_mapping = { "Mr": 1, "Other": 2, "Military": 4, "Master": 6, "Miss": 7, "Mrs": 8, "Noble": 10 }
train_df['TitleWeight'] = train_df['Title'].map(title_mapping).fillna(0)
test_df['TitleWeight'] = test_df['Title'].map(title_mapping).fillna(0)
train_df['TitleWeight']
Out[54]:
In [55]:
train_df = train_df.drop("Title", axis=1)
test_df = test_df.drop("Title", axis=1)
In [56]:
print(train_df.shape)
print(test_df.shape)
print(train_df.columns)
print(test_df.columns)
In [57]:
train_df[['Survived', 'Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare',
'Embarked_C', 'Embarked_Q', 'Embarked_S', 'FamilySize',
'TravellingAlone', 'FareWeight', 'TicketOccurrences']].head(5)
Out[57]:
In [58]:
train_df[['Section_A',
'Section_B', 'Section_C', 'Section_D', 'Section_E', 'Section_F',
'Section_G', 'Section_T', 'Section_Unknown', 'TitleWeight']].head(5)
Out[58]:
In [59]:
train_data = train_df.drop("Survived", axis=1)
train_label = train_df["Survived"]
test_data = test_df.copy()
train_data.shape, train_label.shape, test_data.shape
Out[59]:
In [60]:
from sklearn.naive_bayes import GaussianNB
gaussian = GaussianNB()
gaussian.fit(train_data, train_label)
gaussian.score(train_data, train_label)
Out[60]:
In [61]:
gaussian.predict(test_data)[:10]
Out[61]:
In [62]:
gaussian.score(test_data, test_check_df.drop(['PassengerId'], axis=1))
Out[62]:
In [63]:
from sklearn.ensemble import RandomForestClassifier
# random_state fixed so the forest (and the scores below) are reproducible
# across Restart & Run All — the original was unseeded and non-deterministic.
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(train_data, train_label)
random_forest.score(train_data, train_label)
Out[63]:
In [64]:
random_forest.predict(test_data)[:10]
Out[64]:
In [65]:
random_forest.score(test_data, test_check_df.drop(['PassengerId'], axis=1))
Out[65]:
In [66]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
# Recursive feature elimination down to the 10 best features.
# NOTE(review): SVR is a *regressor* fit on a binary target here; RFE will still
# rank features by the linear coefficients, but a classifier (e.g. LinearSVC or
# LogisticRegression) would be the more natural estimator — confirm intent.
rfe = RFE(estimator=SVR(kernel="linear"), n_features_to_select=10)
selector = rfe.fit(train_df.drop("Survived", axis=1), train_df["Survived"])
selector
Out[66]:
In [67]:
train_df.drop("Survived", axis=1).columns[selector.support_]
Out[67]:
In [68]:
train_data = train_df[train_df.drop("Survived", axis=1).columns[selector.support_]]
train_label = train_df["Survived"]
test_data = test_df[test_df.columns[selector.support_]].copy()
train_data.shape, train_label.shape, test_data.shape
Out[68]:
In [69]:
from sklearn.naive_bayes import GaussianNB
gaussian = GaussianNB()
gaussian.fit(train_data, train_label)
gaussian.score(train_data, train_label), gaussian.score(test_data, test_check_df.drop(['PassengerId'], axis=1))
Out[69]:
In [70]:
from sklearn.ensemble import RandomForestClassifier
# Refit on the RFE-selected features; random_state fixed so the train/test
# scores are reproducible (the original unseeded forest gave different numbers
# on every run).
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(train_data, train_label)
random_forest.score(train_data, train_label), random_forest.score(test_data, test_check_df.drop(['PassengerId'], axis=1))
Out[70]:
In [71]:
test_data.columns
Out[71]:
In [72]:
# Hypothetical passenger: Adriano as a child in 2006 — male, travelling alone,
# cabin section unknown, title "Master" (TitleWeight 6).
adriano_2006 = pd.DataFrame([{
    'Sex': 1,              # male
    'SibSp': 0,            # no siblings/spouse aboard
    'FamilySize': 1,       # alone
    'TravellingAlone': 1,  # alone
    'Section_C': 0,        # cabin section unknown
    'Section_D': 0,
    'Section_E': 0,
    'Section_F': 0,
    'Section_G': 0,
    'TitleWeight': 6       # "Master"
}])
adriano_2006
Out[72]:
In [73]:
# Hypothetical passenger: Adriano in 2018 — married (travelling with spouse),
# cabin in section C, title "Mr".
adriano_2018 = pd.DataFrame({
    'Sex': [1],              # male
    'SibSp': [1],            # spouse aboard
    'FamilySize': [2],       # passenger + spouse
    'TravellingAlone': [0],  # FIX: FamilySize == 2, so by the feature's own
                             # definition (FamilySize == 1 -> 1) this must be 0;
                             # the original set it to 1 despite "married".
    'Section_C': [1],        # cabin on section C
    'Section_D': [0],
    'Section_E': [0],
    'Section_F': [0],
    'Section_G': [0],
    'TitleWeight': [1]       # "Mr"
})
adriano_2018
Out[73]:
In [74]:
prediction = pd.DataFrame({
'2006 Adriano': random_forest.predict(adriano_2006),
'2018 Adriano': random_forest.predict(adriano_2018)
}, index=['Survived']).transpose()
prediction
Out[74]:
In [75]:
prediction = pd.DataFrame({
'2006 Adriano': gaussian.predict(adriano_2006),
'2018 Adriano': gaussian.predict(adriano_2018)
}, index=['Survived']).transpose()
prediction
Out[75]:
In [76]:
prediction.to_csv('output/prediction.csv')