In [1]:
# Show the kernel's current working directory; the relative CSV path used
# when loading the data is resolved against it.
import os
os.getcwd()
Out[1]:
In [2]:
import numpy as np
import pandas as pd
# Relative path of the passenger CSV (resolved against the working directory).
TEST_DATA = 'train.csv'

# header=0: the first row of the file supplies the column names.
titanic_dataframe = pd.read_csv(TEST_DATA, header=0)

# Report the number of rows that were loaded.
print('length: {0} '.format(titanic_dataframe.shape[0]))

# Preview the first five rows as the cell's output (head() defaults to 5).
titanic_dataframe.head()
Out[2]:
Average age of all Titanic passengers:
In [3]:
# Mean of the Age column; Series.mean() skips NaN entries by default.
avg_all = titanic_dataframe['Age'].mean()
print(avg_all)
Average age of Titanic survivors:
In [4]:
# Select the ages of rows flagged Survived == 1, then average them
# (NaN ages are skipped by Series.mean()).
is_survivor = titanic_dataframe['Survived'] == 1
avg_survive = titanic_dataframe.loc[is_survivor, 'Age'].mean()
print(avg_survive)
Average age of non-surviving first-class passengers:
In [5]:
# Rows with Survived == 0 and Pclass == 1; average their ages
# (NaN ages are skipped by Series.mean()).
is_nonsurvivor_firstclass = (
    (titanic_dataframe['Survived'] == 0)
    & (titanic_dataframe['Pclass'] == 1)
)
avg_nonsurvivor_firstclass = titanic_dataframe.loc[is_nonsurvivor_firstclass, 'Age'].mean()
print(avg_nonsurvivor_firstclass)
Average age of male survivors over 30 from anywhere but Queenstown:
In [6]:
# Male survivors older than 30 whose Embarked value is not 'Q'.
# The Age > 30 test is False for NaN ages, so those rows are excluded;
# a NaN Embarked value compares unequal to 'Q', so such rows are kept.
is_male_survivor_over_30_noq = (
    (titanic_dataframe['Survived'] == 1)
    & (titanic_dataframe['Sex'] == 'male')
    & (titanic_dataframe['Age'] > 30)
    & (titanic_dataframe['Embarked'] != 'Q')
)
avg_male_survived_30_noq = titanic_dataframe.loc[is_male_survivor_over_30_noq, 'Age'].mean()
print(avg_male_survived_30_noq)
Difference between mean and median for all passengers, filling in NaN with mean
In [7]:
# NOTE(review): the heading for this cell mentions filling NaN ages with the
# mean, but the code excludes them instead (`Age > 0` is False for NaN).
# Confirm which treatment is intended.
known_ages = titanic_dataframe.loc[titanic_dataframe['Age'] > 0, 'Age']
median_forall = np.median(known_ages)

# Median minus mean; avg_all comes from an earlier cell.
print(median_forall - avg_all)
Difference between mean and median ages for survivors:
In [8]:
# Ages of survivors; the Age > 0 test drops NaN ages, which np.median
# would otherwise propagate into the result.
is_survivor_with_age = (
    (titanic_dataframe['Age'] > 0)
    & (titanic_dataframe['Survived'] == 1)
)
median_survivor = np.median(titanic_dataframe.loc[is_survivor_with_age, 'Age'])

# Median minus mean; avg_survive comes from an earlier cell.
print(median_survivor - avg_survive)
Difference between mean and median ages for non-surviving first-class passengers:
In [9]:
# Ages of first-class passengers who did not survive; the Age > 0 test
# drops NaN ages so np.median returns a number.
is_first_dead_with_age = (
    (titanic_dataframe['Age'] > 0)
    & (titanic_dataframe['Survived'] == 0)
    & (titanic_dataframe['Pclass'] == 1)
)
median_first_dead = np.median(titanic_dataframe.loc[is_first_dead_with_age, 'Age'])

# Median minus mean; avg_nonsurvivor_firstclass comes from an earlier cell.
print(median_first_dead - avg_nonsurvivor_firstclass)
Difference between mean and median ages for surviving men over 30 who did not embark at Queenstown:
In [10]:
# Male survivors older than 30 whose Embarked value is not 'Q'.
# Age > 30 is False for NaN ages, so no NaN reaches np.median.
is_surviving_man_over_30_noq = (
    (titanic_dataframe['Survived'] == 1)
    & (titanic_dataframe['Sex'] == 'male')
    & (titanic_dataframe['Age'] > 30)
    & (titanic_dataframe['Embarked'] != 'Q')
)
median_men_noq = np.median(titanic_dataframe.loc[is_surviving_man_over_30_noq, 'Age'])

# Median minus mean; avg_male_survived_30_noq comes from an earlier cell.
print(median_men_noq - avg_male_survived_30_noq)
Most common passenger class:
In [11]:
# mode() returns a Series of the most frequent value(s); item() unwraps it
# to a plain scalar and raises if there is more than one mode.
pclass_modes = titanic_dataframe.Pclass.mode()
pclass_modes.item()
Out[11]:
Mode of Port of Embarkation:
In [12]:
# Most frequent Embarked value; item() unwraps the single-element result
# and raises if there is more than one mode.
embarked_modes = titanic_dataframe.Embarked.mode()
embarked_modes.item()
Out[12]:
Mode of siblings/spouses aboard:
In [13]:
# Most frequent SibSp value; item() unwraps the single-element result
# and raises if there is more than one mode.
sibsp_modes = titanic_dataframe.SibSp.mode()
sibsp_modes.item()
Out[13]:
The median ticket price lies within one standard deviation of the mean (between 0 and 1 stds away), and it is below the mean.
In [14]:
# Median of the Fare column, shown as the cell's output.
np.median(titanic_dataframe.Fare)
Out[14]:
In [15]:
# Mean of the Fare column, shown as the cell's output.
titanic_dataframe.Fare.mean()
Out[15]:
In [16]:
# Most frequent Fare value; item() unwraps the single-element result
# and raises if there is more than one mode.
fare_modes = titanic_dataframe.Fare.mode()
fare_modes.item()
Out[16]:
In [17]:
# Sample standard deviation of Fare (ddof=1 divides by n - 1).
titanic_dataframe.Fare.std(ddof=1)
Out[17]:
The cost difference between the 90th- and 5th-percentile tickets, and the passenger classes of those tickets:
In [18]:
# Fares at the 90th and 5th percentiles, and the gap between them.
fares = titanic_dataframe.Fare
nintieth_cost = fares.quantile(0.9)
fifth_cost = fares.quantile(0.05)
print(nintieth_cost - fifth_cost)
In [19]:
# Passenger class of every ticket priced exactly at the 5th-percentile fare.
#
# interpolation='lower' guarantees the quantile is an actual fare from the
# data. The default linear interpolation can return a value lying between two
# fares, in which case the equality test below would match no rows at all.
# Whenever the linear result is itself a real fare, 'lower' returns the same
# value, so results that previously matched are unchanged.
fifth_percentile_fare = titanic_dataframe['Fare'].quantile(.05, interpolation='lower')
class_level = titanic_dataframe.loc[titanic_dataframe['Fare'] == fifth_percentile_fare, 'Pclass']
print(class_level)
In [20]:
# Passenger class of every ticket priced exactly at the 90th-percentile fare.
#
# interpolation='lower' guarantees the quantile is an actual fare from the
# data. The default linear interpolation can return a value lying between two
# fares, in which case the equality test below would match no rows at all.
# Whenever the linear result is itself a real fare, 'lower' returns the same
# value, so results that previously matched are unchanged.
ninetieth_percentile_fare = titanic_dataframe['Fare'].quantile(.9, interpolation='lower')
upper_class = titanic_dataframe.loc[titanic_dataframe['Fare'] == ninetieth_percentile_fare, 'Pclass']
print(upper_class)
The port with the most expensive average ticket price is Cherbourg
In [21]:
# Mean fare for each embarkation code: 'S', 'Q' and 'C'.
embarked = titanic_dataframe['Embarked']
south = titanic_dataframe.loc[embarked == 'S', 'Fare'].mean()
queens = titanic_dataframe.loc[embarked == 'Q', 'Fare'].mean()
cherb = titanic_dataframe.loc[embarked == 'C', 'Fare'].mean()
print(south, queens, cherb)
The port whose passengers are most alike in class is Queenstown, because its passenger-class values have the smallest standard deviation.
In [22]:
# Standard deviation of Pclass for each embarkation code.
# ddof=0 (population std) is spelled out because that is what np.std applies
# when no ddof is given; note the Fare std elsewhere in this notebook uses ddof=1.
port_of_embarkation = titanic_dataframe['Embarked']
southie = titanic_dataframe.loc[port_of_embarkation == 'S', 'Pclass'].std(ddof=0)
queenie = titanic_dataframe.loc[port_of_embarkation == 'Q', 'Pclass'].std(ddof=0)
cherbie = titanic_dataframe.loc[port_of_embarkation == 'C', 'Pclass'].std(ddof=0)
print(southie, queenie, cherbie)
The average surviving passenger with family members was ~6.88 years younger than the average non surviving passenger with no family
In [23]:
# Survivors with at least one sibling/spouse or parent/child aboard.
travels_with_family = (titanic_dataframe['SibSp'] > 0) | (titanic_dataframe['Parch'] > 0)
is_survivor_with_family = (titanic_dataframe['Survived'] == 1) & travels_with_family
avg_surv_w_fam = titanic_dataframe.loc[is_survivor_with_family, 'Age'].mean()

# Non-survivors with neither siblings/spouses nor parents/children aboard.
is_nonsurvivor_alone = (
    (titanic_dataframe['Survived'] == 0)
    & (titanic_dataframe['SibSp'] == 0)
    & (titanic_dataframe['Parch'] == 0)
)
avg_non_no_fam = titanic_dataframe.loc[is_nonsurvivor_alone, 'Age'].mean()

# Negative when the survivors-with-family group is younger on average.
print(avg_surv_w_fam - avg_non_no_fam)
Display the relationship (i.e. make a plot) between survival rate and the quantile of the ticket price for 20 integer quantiles. Make sure you label your axes.
In [26]:
import matplotlib.pyplot as pl
%matplotlib inline
fare_quantiles = np.percentile(titanic_dataframe['Fare'], np.arange(5, 105, 5.0))
survival_quantiles = []
latest_quantile = 0
for f_q in fare_quantiles:
folks= titanic_dataframe[(latest_quantile < titanic_dataframe.Fare) & (titanic_dataframe.Fare < f_q)]
survival_quantiles.append(len(folks[titanic_dataframe.Survived == 1]) / float(len(folks)))
graph = pl.plot(np.arange(5, 105, 5.0), survival_quantiles)
Surviving first-class men who paid less than the median fare:
In [8]:
# Median fare across all rows of the dataset.
median_fare = np.median(titanic_dataframe.Fare)

# Male first-class survivors.
is_surviving_first_class_man = (
    (titanic_dataframe['Survived'] == 1)
    & (titanic_dataframe['Pclass'] == 1)
    & (titanic_dataframe['Sex'] == 'male')
)
paid_below_median = titanic_dataframe['Fare'] < median_fare

# Count the rows in each group; int() keeps the counts as plain Python ints.
surviving_first_less_median_fare = int((is_surviving_first_class_man & paid_below_median).sum())
surviving_first = int(is_surviving_first_class_man.sum())

print(surviving_first_less_median_fare)
print(surviving_first)
print(median_fare)
In [ ]: