In [1]:
import pandas as pd
import numpy as np
import glob # to find all files in folder
from datetime import datetime
from datetime import date, time
from dateutil.parser import parse
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
sns.set_context('notebook')
pd.options.mode.chained_assignment = None # default='warn'
In [2]:
# HTML is imported from IPython's public display module; the original
# `IPython.core.display` path is an internal location.
from IPython.display import HTML

# Render the HTML description of the data set stored next to the spreadsheet.
HTML(filename='Data/titanic.html')
Out[2]:
In [3]:
# Read the passenger table from the Excel sheet.
original_data = pd.read_excel('Data/titanic.xls')
original_data['total'] = 1 # add a column consisting only of 1s to make counting easier
original_data.head(2)
Out[3]:
In [4]:
pclass = original_data['pclass']
pclass.unique()
Out[4]:
there are 3 different classes
In [5]:
# Print how many passengers travelled in each class.
for class_id in pclass.unique():
    class_size = int((pclass == class_id).sum())
    print('nbr in class ' + str(class_id) + ': ' + str(class_size))
Most passengers are in class 3, but surprisingly class 1 has more passengers than class 2.
In [6]:
plt.hist(pclass.values)
Out[6]:
In [7]:
surv = original_data['survived']
surv.unique() # to make sure there are only 1 and 0
Out[7]:
In [8]:
#how many survived?
surv.sum()
Out[8]:
In [9]:
#how many died?
# Count the passengers whose 'survived' flag is 0.
int((surv == 0).sum())
Out[9]:
most died :(
In [10]:
# Percentage of passengers who survived. The float literal 100.0 matches the
# percentage helpers further down and keeps this a true division even if the
# notebook is run by an interpreter with integer `/`.
100.0/len(surv.values) * surv.sum()
Out[10]:
only 38% survived
In [11]:
name = original_data['name']
len(name.unique()) == len(name.values)
Out[11]:
apparently there are some with the same name
In [12]:
len(name.values) - len(name.unique())
Out[12]:
In [13]:
#lets find them
# Select every row whose name occurs more than once (all occurrences,
# not only the repeated ones).
original_data[name.duplicated(keep=False)]
Out[13]:
In [14]:
sex = original_data['sex']
sex.unique()
Out[14]:
In [15]:
nbr_males = len(sex[sex == 'male'])
In [16]:
nbr_females= len(sex[sex == 'female'])
In [17]:
# Percentage of male passengers. The float literal 100.0 matches the
# percentage helpers further down and keeps this a true division even if the
# notebook is run by an interpreter with integer `/`.
100.0/len(sex) * nbr_males
Out[17]:
64.4% are male
In [18]:
age = original_data['age']
age.unique()
Out[18]:
There are NaN values! But also floating point values, which is somewhat unusual but not a problem per se.
In [19]:
age.min() # a baby?
Out[19]:
In [20]:
age.max()
Out[20]:
In [21]:
age.mean()
Out[21]:
Age distribution in a boxplot:
In [22]:
sns.boxplot(age.dropna().values)
Out[22]:
And the distribution of age plotted:
In [23]:
# Histogram of the recorded ages. Missing ages are dropped first, exactly as
# the boxplot cell does, so that only real values are binned.
plt.hist(age.dropna().values);
In [24]:
sipsp = original_data['sibsp']
sipsp.unique()
Out[24]:
In [25]:
sipsp.mean()
Out[25]:
Plot histogram: Almost all passengers traveled without siblings or spouses. There is apparently one family that traveled together (8 siblings are on board).
In [26]:
plt.hist(sipsp)
Out[26]:
In [27]:
parch = original_data['parch']
parch.unique()
Out[27]:
In [28]:
parch.mean()
Out[28]:
Histogram: Again, almost no one traveled with their kids. The one big family is seen here again.
In [29]:
plt.hist(parch)
Out[29]:
Let's find the family
In [30]:
# the kids
original_data[original_data['sibsp'] == 8]
Out[30]:
In [31]:
# the parents
original_data[original_data['parch'] == 9]
Out[31]:
These are the children and the parents of the 'big' family. Sadly, all of them died :(
In [32]:
ticket = original_data['ticket']
len(ticket.unique())
Out[32]:
In [33]:
ticket.dtype
Out[33]:
In [34]:
# Number of passengers without a ticket entry.
int(ticket.isnull().sum())
Out[34]:
All (registered) passengers had a ticket ;)
In [35]:
fare = original_data['fare']
fare.mean()
Out[35]:
In [36]:
fare.max()
Out[36]:
In [37]:
fare.min()
Out[37]:
There are people that did not pay anything
In [38]:
original_data[fare == 0]
Out[38]:
In [39]:
# Data type of the fare column (same accessor as used for the ticket column).
fare.dtype
Out[39]:
In [40]:
original_data[fare.isnull()]
Out[40]:
there is one NaN value
In [41]:
plt.hist(fare.dropna())
Out[41]:
Someone got ripped off, or got the best room.
In [42]:
cabin = original_data['cabin']
cabin.isnull().sum()
Out[42]:
1014 people have no cabin (all class 3?)
In [43]:
plt.hist(original_data[cabin.isnull()]['pclass'])
Out[43]:
Even people in class 1 have no cabin (or it is unknown)
Some people have several cabins, but those cabins are also occupied by several people, probably families. It would be quite complicated to take those 'multiple cabin' entries apart. With more time we could have done it.
In [44]:
cabin.head()
Out[44]:
In [45]:
embarked = original_data['embarked']
embarked.unique()
Out[45]:
In [46]:
# Number of passengers with no embarkation port recorded.
int(embarked.isnull().sum())
Out[46]:
two people have NaN in 'embarked'
In [47]:
sns.countplot(y="embarked", data=original_data, color="c");
In [48]:
boat = original_data['boat']
boat.unique()
Out[48]:
some have several boats.
In [49]:
body = original_data['body']
body.count()
Out[49]:
121 bodies were assigned a number.
In [50]:
homedest = original_data['home.dest']
len(homedest.dropna().unique())
Out[50]:
369 different home destinations. Let's find the most common one.
In [51]:
# Sum the all-ones 'total' column per home destination, then list the
# destinations from most to least frequent.
passengers_per_dest = original_data[['home.dest', 'total']].groupby(by='home.dest').sum()
passengers_per_dest.sort_values(by='total', ascending=False)
Out[51]:
Most come from New York
First gather the numbers
In [52]:
# Survivors per sex (sum of the 0/1 'survived' flag) plus the head counts
# needed to turn those sums into percentages.
sex_column = original_data['sex']
survived_by_sex = original_data[['survived', 'sex']].groupby('sex').sum()
nbr_males = int((sex_column == 'male').sum())
nbr_females = int((sex_column == 'female').sum())
nbr_total = len(sex_column)
survived_by_sex
Out[52]:
In [53]:
# Consistency check: the male and female counts must add up to the total.
counts_add_up = (nbr_total == nbr_females + nbr_males)
print(counts_add_up)
Then calculate the percentages
In [54]:
# Survival percentage per sex: survivors of that sex divided by the head count
# of that sex. The float literal 100.0 matches the percentage helpers defined
# further down and keeps this a true division regardless of interpreter.
female_survived_percentage = (100.0/nbr_females) * survived_by_sex.at['female', 'survived']
male_survived_percentage = (100.0/nbr_males) * survived_by_sex.at['male', 'survived']
print('female surv: '+str(round(female_survived_percentage, 3))+'%')
print('male surv: '+str(round(male_survived_percentage, 3))+'%')
In [55]:
# Summing the all-ones 'total' column gives the head count of each
# (class, sex) group; summing 'survived' gives its number of survivors.
class_sex_columns = original_data[['pclass', 'sex', 'survived', 'total']]
survived_by_class = class_sex_columns.groupby(['pclass', 'sex']).sum()
survived_by_class
Out[55]:
In [56]:
def combine_surv_total(row):
    """Return the survival percentage for one aggregated row.

    Parameters
    ----------
    row : object with ``total`` and ``survived`` attributes
        ``total`` is the number of passengers in the group and ``survived``
        the number of survivors among them.

    Returns
    -------
    float
        ``100.0 / total * survived``, or NaN when the group is empty.
    """
    # An empty group has no meaningful rate; return NaN instead of dividing
    # by zero (same convention as surv_proportions).
    if row.total == 0:
        return np.nan
    return 100.0/row.total * row.survived
create a new column with the apply method
In [57]:
# Compute the percentage row by row and store it as a new column.
survival_pct = survived_by_class.apply(combine_surv_total, axis=1)
survived_by_class['survived in %'] = survival_pct
survived_by_class
Out[57]:
Here is a plot showing the survival rates. Note that the plot is not based on the data calculated above.
In [67]:
# NOTE(review): leftover inspection cell. Its execution count (67) is out of
# sequence with the neighbouring cells, which suggests it was run after the
# rest of the notebook. Nothing else uses its result; consider removing it.
type(original_data['sex'])
Out[67]:
In [58]:
sns.barplot(x='sex', y='survived', hue='pclass', data=original_data);
We can see that 'women first' is true, but also 'class 1 first'
Create the categories. We use the value -1 to mark passengers whose age is NaN (and put them in the category 'No age').
In [59]:
# Mark passengers without a recorded age with the sentinel value -1.
# The column is re-assigned instead of calling
# `original_data.age.fillna(-1, inplace=True)`: that form fills an intermediate
# Series and is not guaranteed to write back into the frame (pandas
# copy-on-write), and this notebook silences the chained-assignment warning
# that would otherwise flag it.
original_data['age'] = original_data['age'].fillna(-1)

# Bin edges for pd.cut; the sentinel -1 lands in the first bin ('No age').
# The edge values are kept exactly as they were so the categories do not shift.
age_bin_edges = [-2, 0+1e-6, 14+1e-6, 20+1e-6, 64+1e-6, 120]
age_bin_labels = ['No age', 'child', 'adolescent', 'adult', 'senior']
age_cats = pd.cut(original_data['age'], age_bin_edges, labels=age_bin_labels, include_lowest=True)
In [60]:
original_data['age-category'] = age_cats
In [61]:
catsdata = original_data[['sex', 'age-category', 'pclass', 'survived', 'total']]
Then group the data in a sensible way to get the nice Table below.
In [62]:
# Aggregate survivors and head counts per sex / age category / class.
# 'age-category' is a categorical key, so `observed=False` is spelled out:
# combinations with no passengers must stay in the table (the percentage step
# that follows checks for total == 0), and relying on the implicit default for
# categorical groupers is deprecated in recent pandas releases.
# fillna(0) turns sums that come back as NaN for empty groups into zeros.
grouped = catsdata.groupby(['sex', 'age-category', 'pclass'], observed=False).sum().fillna(0)
grouped
Out[62]:
And finally calculate the survival proportion for all cases
In [63]:
def surv_proportions(row):
    """Survival percentage of one group, rounded to two decimals.

    `row` must expose `total` (group size) and `survived` (number of
    survivors). Groups without any passengers yield NaN.
    """
    group_size = row.total
    if group_size != 0:
        percentage = 100.0/group_size * row.survived
        return round(percentage, 2)
    return np.nan
# Evaluate the helper for every row and attach the result as a new column.
survive_portion = grouped.apply(surv_proportions, axis=1)
grouped['survive-portion (%)'] = survive_portion
In [64]:
grouped
Out[64]:
Two plots showing this: the first shows the female passengers and the second the male passengers.
In [65]:
# Survival rate of the female passengers per class, split by age category.
female_passengers = original_data[original_data['sex'] == 'female']
sns.barplot(x="pclass", y="survived", hue="age-category", data=female_passengers)
Out[65]:
Almost all women from class 1 and 2 survived, in class 3 about 50% survived
In [66]:
# Survival rate of the male passengers per class, split by age category.
male_passengers = original_data[original_data['sex'] == 'male']
sns.barplot(x="pclass", y="survived", hue="age-category", data=male_passengers)
Out[66]:
It is interesting to see that almost no men survived, except children. So 'children before adults' was certainly a thing.
In [ ]: