Exploring Titanic Dataset

Goals:

  • Prepare the age data (drop missing values, convert to whole years) so its distribution is easier to work with in density plots

Questions:

  • How can density plots be used to show which factors are associated with survival?

In [54]:
# Import magic
%matplotlib inline

# More imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Set general plot properties: apply seaborn's "white" style
sns.set_style("white")

In [2]:
# Load the Titanic CSV data into a DataFrame. The path is relative, so the
# file is looked up in the current working directory. The frame is only
# loaded here; nothing is displayed by this cell.
titanic_data = pd.read_csv('titanic_data.csv')

In [99]:
# Keep only the rows that have a recorded age.
titanic_data = titanic_data.dropna(subset=['Age'])

# Convert age to whole years. astype(int) truncates the fractional part
# (e.g. 28.5 -> 28), exactly like int(x); it does not round to nearest.
# assign() builds a new frame instead of writing into a filtered slice, which
# avoids pandas' SettingWithCopyWarning and makes the cell safe to re-run.
titanic_data = titanic_data.assign(Age=titanic_data['Age'].astype(int))

# Split by sex, then by passenger class (Pclass values 1, 2, 3).
grouped = titanic_data.groupby('Sex')

# Female passengers, per class
f_grouped = grouped.get_group('female')
f_first = f_grouped[f_grouped['Pclass'] == 1]
f_second = f_grouped[f_grouped['Pclass'] == 2]
f_third = f_grouped[f_grouped['Pclass'] == 3]

# Male passengers, per class
m_grouped = grouped.get_group('male')
m_first = m_grouped[m_grouped['Pclass'] == 1]
m_second = m_grouped[m_grouped['Pclass'] == 2]
m_third = m_grouped[m_grouped['Pclass'] == 3]

In [100]:
# Age histograms for female passengers, one panel per passenger class.
# Survivors are drawn in the default colour, non-survivors in red.
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, sharey=True)

# The original cell referenced undefined names (f, s, t); the per-class
# female frames created earlier are f_first, f_second and f_third.
female_panels = [
    (ax1, f_first, 'Female, 1st class'),
    (ax2, f_second, 'Female, 2nd class'),
    (ax3, f_third, 'Female, 3rd class'),
]

for ax, passengers, title in female_panels:
    ages = passengers['Age']
    survived = passengers['Survived'] == 1

    # Use one set of 20 bins for both histograms in a panel. Passing bins=20
    # to each call would compute separate edges per series, so the overlaid
    # bars would not be directly comparable.
    bin_edges = np.linspace(ages.min(), ages.max(), 21)

    sns.distplot(ages[survived], kde=False, bins=bin_edges,
                 label='Survived', ax=ax)
    sns.distplot(ages[~survived], kde=False, color='r', bins=bin_edges,
                 label='Did not survive', ax=ax)

    ax.set_title(title)
    ax.set_xlabel('Age (years)')

# The y axis is shared, so label it (and add the legend) once on the left panel.
ax1.set_ylabel('Number of passengers')
ax1.legend();


Out[100]:
<matplotlib.axes._subplots.AxesSubplot at 0x10d277150>

In [101]:
# Age histograms for male passengers, one panel per passenger class.
# Survivors are drawn in the default colour, non-survivors in red.
fig, (ax1, ax2, ax3) = plt.subplots(ncols=3, sharey=True)

male_panels = [
    (ax1, m_first, 'Male, 1st class'),
    (ax2, m_second, 'Male, 2nd class'),
    (ax3, m_third, 'Male, 3rd class'),
]

for ax, passengers, title in male_panels:
    ages = passengers['Age']
    survived = passengers['Survived'] == 1

    # Use one set of 20 bins for both histograms in a panel. Passing bins=20
    # to each call would compute separate edges per series, so the overlaid
    # bars would not be directly comparable.
    bin_edges = np.linspace(ages.min(), ages.max(), 21)

    sns.distplot(ages[survived], kde=False, bins=bin_edges,
                 label='Survived', ax=ax)
    sns.distplot(ages[~survived], kde=False, color='r', bins=bin_edges,
                 label='Did not survive', ax=ax)

    ax.set_title(title)
    ax.set_xlabel('Age (years)')

# The y axis is shared, so label it (and add the legend) once on the left panel.
ax1.set_ylabel('Number of passengers')
ax1.legend();


Out[101]:
<matplotlib.axes._subplots.AxesSubplot at 0x10db2b090>

In [ ]: