Describe the data.
In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import matplotlib.pyplot as plt
import random
%matplotlib inline
In [2]:
# Load the passenger data; the relative path means 'titanic.csv' must sit in the notebook's working directory.
titanic = pd.read_csv('titanic.csv')
In [3]:
# (number of rows, number of columns) of the loaded frame
titanic.shape
Out[3]:
In [4]:
# Per-column dtypes of the loaded frame
titanic.dtypes
Out[4]:
In [5]:
# Summary statistics; include='all' also covers the non-numeric columns
titanic.describe(include='all')
Out[5]:
What’s the average age of:
In [6]:
# Row masks for the sub-populations whose average age is reported
is_survivor = titanic['Survived'] == 1
is_dead_first_class = (titanic['Survived'] == 0) & (titanic['Pclass'] == 1)
is_older_male_survivor_not_q = (
    (titanic['Sex'] == 'male')
    & (titanic['Survived'] == 1)
    & (titanic['Age'] > 30)
    & (titanic['Embarked'] != 'Q')
)

# (label, ages) pairs; .where() blanks the non-matching rows, which mean() then skips
labelled_ages = [
    ('Age average:', titanic['Age']),
    ('Survivor age average:', titanic.where(is_survivor)['Age']),
    ('Non-surviving first-class age average:', titanic.where(is_dead_first_class)['Age']),
    ('Male survivors older than 30 not from Queenstown age average:',
     titanic.where(is_older_male_survivor_not_q)['Age']),
]
for label, ages in labelled_ages:
    print(label, ages.mean())
For the groups from the previous task, how far (in years) are the average ages from the median ages?
In [7]:
# Same four groups as in the previous task, in the same order
survivor_mask = titanic['Survived'] == 1
dead_first_class_mask = (titanic['Survived'] == 0) & (titanic['Pclass'] == 1)
older_male_survivor_not_q_mask = (
    (titanic['Sex'] == 'male')
    & (titanic['Survived'] == 1)
    & (titanic['Age'] > 30)
    & (titanic['Embarked'] != 'Q')
)

age_groups = [
    titanic['Age'],
    titanic.where(survivor_mask)['Age'],
    titanic.where(dead_first_class_mask)['Age'],
    titanic.where(older_male_survivor_not_q_mask)['Age'],
]
# Positive value: the mean lies above the median (in years)
for group_ages in age_groups:
    print(group_ages.mean() - group_ages.median())
What’s the most common:
In [8]:
# mode() returns every tied value sorted ascending; the first entry is reported
top_class = titanic['Pclass'].mode().iloc[0]
top_port = titanic['Embarked'].mode().iloc[0]
survivor_sibsp = titanic.loc[titanic['Survived'] == 1, 'SibSp']
top_survivor_sibsp = survivor_sibsp.mode().iloc[0]

print('Most common passenger class:', top_class)
print('Most common port of embarkation:', top_port)
print('Most common number of siblings/spouses for survivors:', top_survivor_sibsp)
Within what range of standard deviations from the mean (0-1, 1-2, 2-3) is the median ticket price? Is it above or below the mean?
It's between 0 and 1 standard deviations and below the mean:
In [9]:
fares = titanic['Fare']
fare_mean = fares.mean()
fare_median = fares.median()

# Distance of the median from the mean, expressed in standard deviations
print((fare_mean - fare_median) / fares.std())
# True when the median lies below the mean
print(fare_mean > fare_median)
How much more expensive was the 90th percentile ticket than the 5th percentile ticket? Are they the same class?
In [10]:
perc5 = titanic['Fare'].quantile(0.05)
perc90 = titanic['Fare'].quantile(0.9)


def class_at_fare(fare):
    """Return the Pclass of the passenger whose fare is closest to `fare`.

    quantile() interpolates linearly, so the percentile value is not guaranteed
    to be an actual ticket price; an exact `==` lookup can come back empty and
    raise IndexError. Picking the nearest fare always yields a passenger, and
    returns the same row as the exact lookup whenever an exact match exists.
    """
    closest_row = (titanic['Fare'] - fare).abs().idxmin()
    return titanic.loc[closest_row, 'Pclass']


print('5th percentile:', perc5)
print('Class of the 5th percentile:', class_at_fare(perc5))
print('90th percentile:', perc90)
print('Class of the 90th percentile:', class_at_fare(perc90))
# The question asks how much more expensive the 90th percentile ticket was
print('Difference (90th - 5th):', perc90 - perc5)
The highest average ticket price was paid by passengers from which port? Null ports don’t count.
In [11]:
# groupby drops null ports by default. idxmax() returns the port *label* of the
# highest mean fare; argmax() returns an integer position in current pandas.
titanic.groupby('Embarked')['Fare'].mean().idxmax()
Out[11]:
What is the most common passenger class for each port?
In [12]:
# Boolean indexing keeps Pclass as an integer; masking with .where() would
# insert NaN rows, upcast the column to float and print the class as e.g. 3.0.
for port in titanic['Embarked'].dropna().unique():
    port_classes = titanic.loc[titanic['Embarked'] == port, 'Pclass']
    print('Most common class for {}: {}'.format(port, port_classes.mode()[0]))
What fraction of surviving 1st-class males paid lower than double the overall median ticket price?
In [13]:
# Denominator group: surviving first-class males
surviving_first_class_males = (
    (titanic['Survived'] == 1)
    & (titanic['Sex'] == 'male')
    & (titanic['Pclass'] == 1)
)
# Numerator group: those who paid strictly less than twice the overall median fare
fare_cap = 2 * titanic['Fare'].median()
paid_below_cap = surviving_first_class_males & (titanic['Fare'] < fare_cap)

n_below_cap = titanic.where(paid_below_cap)['PassengerId'].count()
n_in_group = titanic.where(surviving_first_class_males)['PassengerId'].count()
n_below_cap / n_in_group
Out[13]:
How much older/younger was the average surviving passenger with family members than the average non-surviving passenger without them?
In [14]:
# Family members on board = siblings/spouses + parents/children
family_size = titanic['SibSp'] + titanic['Parch']
survivor_with_family = (titanic['Survived'] == 1) & (family_size > 0)
victim_without_family = (titanic['Survived'] == 0) & (family_size == 0)

print('Survivor with family members average age:',
      titanic.where(survivor_with_family)['Age'].mean())
print('Non-survivor without family members average age:',
      titanic.where(victim_without_family)['Age'].mean())
Display the relationship (i.e. make a plot) between survival rate and the quantile of the ticket price for 20 integer quantiles.
In [36]:
import math

N_FARE_QUANTILES = 20

# Sort by fare; reset_index() renumbers rows 0..n-1 and keeps the old labels in an 'index' column
titanic_sortbyfare = titanic.sort_values('Fare').reset_index()
# Passengers per quantile bin (the last bin may hold fewer)
fare_bin_size = math.ceil(len(titanic_sortbyfare) / N_FARE_QUANTILES)
# 0-based quantile number from each row's rank in the *sorted* frame. The rank
# is generated here rather than read from titanic.index, which only matched
# because the unsorted frame happened to have a default 0..n-1 index.
titanic_sortbyfare['FareQuantile'] = np.arange(len(titanic_sortbyfare)) // fare_bin_size
# Survival rate per quantile: the mean of the 0/1 Survived flag
titanic_fareq = titanic_sortbyfare.groupby('FareQuantile')['Survived'].mean()
In [37]:
# The bundled 'seaborn' style was deprecated in matplotlib 3.6 in favour of
# 'seaborn-v0_8' and later removed; use the new name when this install has it.
seaborn_style = 'seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn'
with plt.style.context(seaborn_style):
    fig = plt.figure(figsize=(16, 6))
    ax = plt.axes()
    ax.plot(titanic_fareq)
    # One tick per quantile, labelled with the quantile's upper bound (0.05 ... 1.0)
    ax.xaxis.set_major_locator(plt.FixedLocator(titanic_fareq.index.values))
    ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda val, pos: (val+1) / 20))
    # Set x axis limits
    ax.set_xlim(min(titanic_fareq.index.values), max(titanic_fareq.index.values))
    # Add labels and title
    ax.set_title('Survival rate by fare quantile')
    ax.set_xlabel('Fare Quantile')
    ax.set_ylabel('Survival Rate');
For each of the following characteristics, find the median in the data:
In [17]:
# Median of each candidate boundary characteristic (NaNs are skipped)
median_columns = ['Age', 'Fare', 'SibSp', 'Parch']
titanic[median_columns].median()
Out[17]:
If you were to use these medians to draw numerical boundaries separating survivors from non-survivors, which of these characteristics would be the best choice and why?
In [83]:
boundary_attrs = ['Age', 'Fare', 'SibSp', 'Parch']
titanic_survrate = titanic[['Survived'] + boundary_attrs].copy()
# Compute each median once instead of once per row inside the helper
boundary_medians = titanic_survrate[boundary_attrs].median()


def below_median(x, attr):
    """Flag whether a value is at/below the median of an attribute.

    Parameters
    ----------
    x : scalar
        Value of the attribute for one passenger.
    attr : str
        Attribute (column) name, one of `boundary_attrs`.

    Returns
    -------
    1 if x <= median, 0 if x > median, None if x is missing (NaN compares
    False both ways).
    """
    median = boundary_medians[attr]
    if x <= median:
        return 1
    if x > median:
        return 0
    return None


# Add one '<attr>BelowMedian' flag column per attribute
for attr in boundary_attrs:
    titanic_survrate[attr + 'BelowMedian'] = titanic_survrate[attr].apply(below_median, args=(attr,))

# Survival rate above (0) and at/below (1) the median; the mean of the 0/1
# Survived flag equals sum / count
for attr in boundary_attrs:
    print(titanic_survrate.groupby(attr + 'BelowMedian')['Survived'].mean())
In [64]:
# This is from the solution
def survival_ratio(predicate):
    """Return the fraction of survivors among the rows of `titanic` selected by `predicate`.

    `predicate` is used directly as a row selector on the global `titanic`
    frame (e.g. a boolean Series). Raises ZeroDivisionError when it selects
    no rows.
    """
    selected = titanic[predicate]
    survivors = selected[selected['Survived'] == True]
    return len(survivors) / len(selected)
median_attrs = ['Age', 'Fare', 'SibSp', 'Parch']

# Build each Series in one go with an explicit float dtype; an empty
# pd.Series(name=...) grown item by item has no declared dtype (it warns or
# falls back to object depending on the pandas version).
# SibSp and Parch now use `<=` like Age and Fare, so the row really is
# "below/at the median". The old `==` only gave the same result because the
# median happened to be the minimum value.
below_at_median = pd.Series(
    {attr: survival_ratio(titanic[attr] <= titanic[attr].median()) for attr in median_attrs},
    name='Surv. below/at the median',
    dtype=float,
)
above_median = pd.Series(
    {attr: survival_ratio(titanic[attr] > titanic[attr].median()) for attr in median_attrs},
    name='Surv. above the median',
    dtype=float,
)

# One row per attribute, one column per side of the median
survival_median = pd.DataFrame([below_at_median, above_median],
                               columns=median_attrs).transpose()
# A large absolute gap means the median separates survivors from non-survivors well
survival_median['above - below'] = above_median - below_at_median
survival_median
Out[64]:
Plot the distribution of passenger ages. Choose visually-meaningful bin sizes and label your axes.
In [84]:
# Passengers with a known age only
known_ages = titanic['Age'].dropna()

fig = plt.figure(figsize=(16, 8))
ax = plt.axes()
# Histogram of ages split into 20 equal-width bins
ax.hist(known_ages, bins=20)
# Axis labels
ax.set_ylabel('Number of Passengers')
ax.set_xlabel('Age')
# Allow up to 20 intervals between major ticks on the age axis
age_tick_locator = plt.MaxNLocator(20)
ax.xaxis.set_major_locator(age_tick_locator)
Find the probability that:
In [85]:
n_passengers = len(titanic)


def _fraction_of_passengers(mask):
    """Return the share of all passengers for which the boolean `mask` is True."""
    return mask.sum() / n_passengers


age = titanic['Age']

print('Survival probability:', titanic['Survived'].sum() / n_passengers)
print('Male probability:', _fraction_of_passengers(titanic['Sex'] == 'male'))
print('Female and at least one sibling/spouse probability:',
      _fraction_of_passengers((titanic['Sex'] == 'female') & (titanic['SibSp'] > 0)))
# Fixed: this previously counted everyone who embarked at Cherbourg, survivors or not
print('Survivor from Cherbourg probability:',
      _fraction_of_passengers((titanic['Survived'] == 1) & (titanic['Embarked'] == 'C')))
# Passengers with a missing age match none of the age conditions but still count in the denominator
print('Less than 10 years old probability:', _fraction_of_passengers(age < 10))
print('Between 25 and 40 years old probability:', _fraction_of_passengers((age > 25) & (age < 40)))
print('Less than 20 or more than 50 years old probability:', _fraction_of_passengers((age < 20) | (age > 50)))
Knowing nothing else about the passengers aside from the survival rate of the population (see question above), if I choose 100 passengers at random from the passenger list, what’s the probability that exactly 42 passengers survive?
In [87]:
# Population survival rate = survivors / passengers
n_survivors = titanic['Survived'].sum()
survival_rate = n_survivors / len(titanic)
# Binomial probability of exactly 42 survivors in 100 independent draws
stats.binom.pmf(k=42, n=100, p=survival_rate)
Out[87]:
What’s the probability that at least 42 of those 100 passengers survive?
In [88]:
# P(X >= 42) = 1 - P(X <= 41) for X ~ Binomial(100, survival_rate)
prob_at_most_41 = stats.binom.cdf(k=41, n=100, p=survival_rate)
1 - prob_at_most_41
Out[88]:
Take random samples of 100 passengers and find out how many you need before the fraction of those samples where at least 42 passengers survive matches the probability you calculated previously (within Δp≈0.05).
Answers will vary based on chosen seeds. What would happen if you drew every sample with the same seed?
Plot the survival fraction vs the number of random samples.
In [95]:
# Seed once. Re-seeding before every draw would return the same 100 passengers
# each time, so the running fraction would be stuck at 0 or 1.
random.seed(42)
# Target probability (P(at least 42 survivors), computed earlier) and tolerance Δp
target_prob = 0.2594
delta = 0.05
sample_size = 100
min_survivors = 42

# Read the flag by column name instead of by hard-coded column position
survived = titanic['Survived']
# random.sample needs a sequence: passing a set is deprecated since Python 3.9
# and raises TypeError from 3.11 on
passenger_positions = range(len(titanic))

# Running fraction of samples with at least `min_survivors` survivors,
# plus counters for samples drawn and samples over the threshold
survival_frac = []
n_samples = 0
n_over = 0
# Draw samples until the running fraction is within delta of the target
while True:
    n_samples += 1
    # 100 distinct passengers, drawn without replacement
    samp = random.sample(passenger_positions, sample_size)
    if survived.iloc[samp].sum() >= min_survivors:
        n_over += 1
    survival_frac.append(n_over / n_samples)
    if abs(survival_frac[-1] - target_prob) < delta:
        break
In [96]:
# Running fraction of samples with at least 42 survivors, one entry per sample drawn
survival_frac
Out[96]:
In [93]:
# One list entry was appended per sample, so the length is the number of samples drawn
print('Number of samples needed:', len(survival_frac))
In [98]:
# Position of the last sample on the 0-based x axis
last_sample_pos = len(survival_frac) - 1

fig = plt.figure(figsize=(16, 6))
ax = plt.axes()
ax.plot(survival_frac)
# Axis labels
ax.set_ylabel('Samples with at least 42 survivors')
ax.set_xlabel('Number of samples')
# The x axis spans all samples; ticks show 1-based sample counts
ax.set_xlim(0, last_sample_pos)
ax.xaxis.set_major_locator(plt.MaxNLocator(8))
ax.xaxis.set_major_formatter(plt.FuncFormatter(lambda value, pos: int(value + 1)))
# Thin red reference line at the target probability
ax.hlines(0.2594, 0, last_sample_pos, colors='red', linewidth=.5);
Is there a statistically significant difference between:
Use a 95% confidence level.
The difference between the ages of male and female survivors is not statistically significant (the p-value is above 0.4):
In [100]:
# Ages of the two groups being compared
survived_flag = titanic['Survived'] == 1
male_survivor_ages = titanic.loc[(titanic['Sex'] == 'male') & survived_flag, 'Age']
female_survivor_ages = titanic.loc[(titanic['Sex'] == 'female') & survived_flag, 'Age']

print('Male survivors age average:', male_survivor_ages.mean())
print('Female survivors age average:', female_survivor_ages.mean())
# Sample standard deviations (male, then female)
print(male_survivor_ages.std())
print(female_survivor_ages.std())
# Welch's t-test (unequal variances) on the known ages
stats.ttest_ind(male_survivor_ages.dropna(),
                female_survivor_ages.dropna(),
                equal_var=False)
Out[100]:
The difference between the fares paid by passengers from Queenstown and Cherbourg is statistically significant (the p-value is less than 0.001):
In [102]:
# Fares of the two groups being compared
queenstown_fares = titanic.loc[titanic['Embarked'] == 'Q', 'Fare']
cherbourg_fares = titanic.loc[titanic['Embarked'] == 'C', 'Fare']

print('Fares paid by passengers embarked in Queenstown average:', queenstown_fares.mean())
print('Fares paid by passengers embarked in Cherbourg average:', cherbourg_fares.mean())
# Sample standard deviations (Queenstown, then Cherbourg)
print(queenstown_fares.std())
print(cherbourg_fares.std())
# Welch's t-test (unequal variances)
stats.ttest_ind(queenstown_fares.dropna(),
                cherbourg_fares.dropna(),
                equal_var=False)
Out[102]:
Accompany your p-values with histograms showing the distributions of both compared populations.
In [109]:
# Bin edges every 10 years: 0, 10, ..., 90
bins0_100 = range(0, 100, 10)
fig = plt.figure(figsize=(16, 6))
ax = plt.axes()
# Overlay the age distributions of male and female survivors on the same bins
is_survivor_row = titanic['Survived'] == 1
for sex, colour in (('male', 'green'), ('female', 'yellow')):
    survivor_ages = titanic.loc[(titanic['Sex'] == sex) & is_survivor_row, 'Age'].dropna()
    ax.hist(survivor_ages, bins=bins0_100, color=colour, alpha=.5, label=sex)
# Title and axis labels
ax.set_title('Male and female survivors ages')
ax.set_xlabel('Age')
ax.set_ylabel('Number of Survivors')
# Legend
ax.legend();
In [110]:
# Bin edges every 50 fare units: 0, 50, ..., 550
bins0_600 = range(0, 600, 50)
fig = plt.figure(figsize=(16, 6))
ax = plt.axes()
# Overlay the fare distributions of the two ports on the same bins
ax.hist(titanic[titanic['Embarked'] == 'Q']['Fare'].dropna(), bins=bins0_600, color='green', alpha=.5, label='Queenstown')
ax.hist(titanic[titanic['Embarked'] == 'C']['Fare'].dropna(), bins=bins0_600, color='yellow', alpha=.5, label='Cherbourg')
# Add labels and title
ax.set_xlabel('Fare')
# These histograms count every passenger from each port, not only survivors
ax.set_ylabel('Number of Passengers')
ax.set_title('Fares of passengers from Queenstown and Cherbourg')
# Add legend
ax.legend();
Did survivors pay more for their tickets than those that did not? Use a 95% confidence level.
The difference between the fares paid by survivors and non-survivors is statistically significant (the p-value is less than 0.001):
In [112]:
survivor_fares = titanic.loc[titanic['Survived'] == 1, 'Fare']
non_survivor_fares = titanic.loc[titanic['Survived'] == 0, 'Fare']

print('Survivors average fare:', survivor_fares.mean())
print('Non-survivors average fare:', non_survivor_fares.mean())
# Sample standard deviations (survivors, then non-survivors)
print(survivor_fares.std())
print(non_survivor_fares.std())

# Welch's t-test (unequal variances); ttest_ind reports a two-sided p-value
t_stat, p_two_sided = stats.ttest_ind(survivor_fares.dropna(),
                                      non_survivor_fares.dropna(),
                                      equal_var=False)
# One-sided alternative "survivors paid more" (t > 0): halve the two-sided
# p-value when t has the hypothesised sign, otherwise take the complement
p_one_sided = p_two_sided / 2 if t_stat > 0 else 1 - p_two_sided / 2
print('t statistic:', t_stat)
print('One-sided p-value:', p_one_sided)
Out[112]:
Did a given first-class passenger have fewer family members on board than a given third-class passenger? Use a 95% confidence level.
The difference between the number of family members on board for first-class and third-class passengers is statistically significant at the 95% confidence level (the p-value is 0.02, which is below 0.05):
In [114]:
# Family members on board = parents/children + siblings/spouses
family_members = titanic['Parch'] + titanic['SibSp']
first_class_family = family_members[titanic['Pclass'] == 1]
third_class_family = family_members[titanic['Pclass'] == 3]

print('First class average number of family members:', first_class_family.mean())
print('Third class average number of family members:', third_class_family.mean())
# Sample standard deviations (first class, then third class)
print(first_class_family.std())
print(third_class_family.std())

# Welch's t-test (unequal variances); ttest_ind reports a two-sided p-value
t_stat, p_two_sided = stats.ttest_ind(first_class_family.dropna(),
                                      third_class_family.dropna(),
                                      equal_var=False)
# One-sided alternative "first class had fewer family members" (t < 0): halve
# the two-sided p-value when t has the hypothesised sign, otherwise take the complement
p_one_sided = p_two_sided / 2 if t_stat < 0 else 1 - p_two_sided / 2
print('t statistic:', t_stat)
print('One-sided p-value:', p_one_sided)
Out[114]: