In [33]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import math as m
from scipy.stats.stats import pearsonr
%matplotlib inline
In [34]:
#import files
titanic = pd.read_csv('titanic_data.csv')
In [35]:
titanic.head()
Out[35]:
In [36]:
# Likelihood to survive if you are a male / female
def prob_to_survive(x,y):
varx = titanic[titanic[x] == y]
return varx[varx['Survived'] == 1].count() / varx.count()
print prob_to_survive('Sex','male')[0]
print prob_to_survive('Sex','female')[0]
Your likelihood to survive was way higher if you were a women.
In [37]:
# Likelihood to survive according to your Pclass
print prob_to_survive('Pclass',1)[0]
print prob_to_survive('Pclass',2)[0]
print prob_to_survive('Pclass',3)[0]
Your likelihood to survive was way higher if you were coming from a higher socio-economic status.
In [38]:
# Likelihood to survive if you were in family or not
titanic['inFamily'] = np.where(titanic['Parch'] > 0, 'Yes','No')
print prob_to_survive('inFamily','Yes')[0]
print prob_to_survive('inFamily','No')[0]
Your likelihood to survive was higher if you came to the titanic with family.
In [39]:
# Likelihood to survive if you were in family or not
titanic['inCouple'] = np.where(titanic['SibSp'] > 0, 'Yes','No')
print prob_to_survive('inCouple','Yes')[0]
print prob_to_survive('inCouple','No')[0]
Your likelihood to survive was more importante if you came to the titanic as a couple.
In [40]:
#vizualisation of the data to get a first understanding
titanic_age_viz = titanic[['Survived','Age']].dropna()
ax = sns.violinplot(x="Survived", y="Age", data=titanic_age_viz)
The violin plot chart shows us that the survived violin is larger on the bottom and on the top. It's hard to conclude / draw a strong hypothesis looking at this chart. But because I think childs have been prioritized in boats, my hypothesis will be that the average age of survivors is lower than the average age of non-survivors.
In [41]:
#Data wrangling - creation and cleaning of my two samples
titanic_survivors = titanic[titanic['Survived'] == 1]
titanic_non_survivors = titanic[titanic['Survived'] == 0]
titanic_survivors = titanic_survivors[['Survived','Age']].dropna()
titanic_non_survivors = titanic_non_survivors[['Survived','Age']].dropna()
In [42]:
print titanic_survivors['Age'].count()
print titanic_non_survivors['Age'].count()
H0: μs = μns - The average age of survivors is not significantly different from the average age of non-survivors.
HA: μns < μs - The average age of survivors is significantly lower from the average age of non-survivors.
I will perform a independent-samples one tail t-test. I choose this test because:
I’ll use a critical statistic value of 0.05%.
In [43]:
print titanic_survivors['Age'].mean()
print titanic_non_survivors['Age'].mean()
In [44]:
t = (titanic_survivors['Age'].mean() - titanic_non_survivors['Age'].mean()) / m.sqrt((titanic_survivors['Age'].var() / titanic_survivors['Age'].count()) + (titanic_non_survivors['Age'].var() / titanic_non_survivors['Age'].count()))
t
Out[44]:
t(712) = -2.0460, p<.05, one-tailed
t-critical value = -1.646 with 712 degree of freedom.
Based on this t test, we can reject the null hypothesis and conclude that age of survivors is significantly lower than the age of non-survivors.
HA: μns < μs
Results match my expectations. Let's calculate r2 to see how much the age influenced the survival of an individual.
In [45]:
r = t**2/((t**2)+712)
r*100
Out[45]:
r2 indicates that even if the age of survivors is significantly lower than the age of non-survivors, that variable does not explained the variability of our independent variable, the survival of individuals in the ship.
We might then make the hypothesis that the sex was the variable having the higher impact on the survival of the individual, women might have been prioritized to take lifeboats, and women might be younger than man in the boat and might have take their child with them.
In [46]:
# Likelihood to survive according to your fare
In [47]:
titanic_fare_norm = (titanic['Fare'] - titanic['Fare'].mean()) / (titanic['Fare'].std(ddof=0))
titanic_survived_norm = (titanic['Survived'] - titanic['Survived'].mean()) / (titanic['Survived'].std(ddof=0))
print pearsonr(titanic_fare_norm,titanic_survived_norm)
There is a weak correlation between the fare you paid and your likelihood to survive.
In [48]:
titanic_fare_norm
titanic_status_norm = (titanic['Pclass'] - titanic['Pclass'].mean()) / (titanic['Pclass'].std(ddof=0))
print pearsonr(titanic_status_norm,titanic_fare_norm)
There is a moderate positive correlation between the social economic status of an individu and its likelihood to survive.
In [49]:
titanic_c = titanic[titanic['Embarked'] == 'C']
titanic_q = titanic[titanic['Embarked'] == 'Q']
titanic_s = titanic[titanic['Embarked'] == 'S']
print titanic_c.count()[0]
print titanic_q.count()[0]
print titanic_s.count()[0]
In [50]:
def two_variables_correl(x,y,z):
titanic_x = titanic[titanic['Embarked'] == x]
titanic_y_norm = (titanic_x[y] - titanic_x[y].mean()) / (titanic_x[y].std(ddof=0))
titanic_z_norm = (titanic_x[z] - titanic_x[z].mean()) / (titanic_x[z].std(ddof=0))
return pearsonr(titanic_y_norm,titanic_z_norm)
In [51]:
print two_variables_correl('C','Pclass','Fare')
print two_variables_correl('Q','Pclass','Fare')
print two_variables_correl('S','Pclass','Fare')
The correlation between the social economic status and the fare is moderate and quite similar for people who came from Cherbourg and Southampton. Regarding people who came from Queenstown, the correlation between these two variables is really strong.
In [52]:
#Data wrangling - dealing with missing Age values.
In [53]:
titanic_age = titanic[['Survived','Age']]
In [54]:
titanic_age_cleaned = titanic_age.dropna()
print titanic_age_cleaned.count()
In [55]:
t_age_graph = titanic_age_cleaned.groupby(['Survived']).hist(stacked=True, bins=20)
We can notice on the two charts above that the histogram plotting the age of survivors is more positively skewed than the histogram plotting the age of people who died.
In [56]:
titanic_age_family = titanic[['Age','Parch','Survived']].dropna()
In [68]:
titanic_alone_child = titanic_age_family[(titanic_age_family['Age'] <= 18) & (titanic_age_family['Parch'] < 1)]
titanic_not_alone_child = titanic_age_family[(titanic_age_family['Age'] <= 18) & (titanic_age_family['Parch'] > 1)]
print titanic_alone_child.Age.count()
t_alone_child = titanic_alone_child.Age.hist()
50 childs were into the titanic without any parents.
In [58]:
titanic_alone_child.groupby(['Survived']).size()
Out[58]:
In [59]:
titanic_not_alone_child.groupby(['Survived']).size()
Out[59]:
A child alone has less chance to survive than a regular child on the Titanic. Only 40% of childs without any parent survived during the tragedy whereas 50% of childs with a parent on the boat survived.
In [60]:
titanic['Fare'].plot.box()
Out[60]:
In [61]:
titanic[np.abs(titanic['Fare']-titanic['Fare'].mean())>=(3*titanic['Fare'].std())].count()
Out[61]:
In [62]:
outliers = titanic[np.abs(titanic['Fare']-titanic['Fare'].mean())>=(3*titanic['Fare'].std())]
outliers.head()
Out[62]:
In [63]:
outliers.groupby(['Survived']).size()
Out[63]:
70% of outliers survived during the sinking but because of the small size of the sample, it's really hard to conclude that this is significant.
In [ ]: