In [1]:
# Import magic
%matplotlib inline
# More imports
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Set up Seaborn: apply seaborn's default styling so that every
# matplotlib figure drawn later in the notebook picks it up
sns.set()
In [2]:
# Load the passenger records from the CSV (path is relative to the
# notebook's working directory) and preview the first rows
titanic_data = pd.read_csv('titanic_data.csv')
titanic_data.head()
Out[2]:
| Variable | Description |
|----------|-------------|
| survival | Survival (0 = No; 1 = Yes) |
| pclass | Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd) |
| name | Name |
| sex | Sex |
| age | Age |
| sibsp | Number of Siblings/Spouses Aboard |
| parch | Number of Parents/Children Aboard |
| ticket | Ticket Number |
| fare | Passenger Fare |
| cabin | Cabin |
| embarked | Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton) |
In [27]:
# (rows, columns) of the loaded passenger table
titanic_data.shape
Out[27]:
In [28]:
# Column labels of the loaded DataFrame
titanic_data.columns
Out[28]:
The Cabin column seems to have a lot of missing (NaN) values. How many rows actually contain cabin information?
In [25]:
# Keep only the rows where a cabin is recorded and report their shape
has_cabin = titanic_data['Cabin'].notnull()
titanic_data[has_cabin].shape
Out[25]:
What is the distribution of passenger class, port of embarkation, and sex?
In [97]:
# Tally passengers by class, port of embarkation and sex.
#
# Vectorised comparisons replace the row-by-row iterrows() loop. Each
# catch-all bucket is computed as the remainder of the total, so any value
# that does not match an explicit category (including NaN) lands in the
# same bucket the original else-branches used.
total = len(titanic_data)

# Passenger class: anything that is not 1st or 2nd is reported as third.
pclass = titanic_data['Pclass']
first = int((pclass == 1).sum())
# Fix: each second-class passenger counts once (the loop used `+= 2`,
# which doubled the reported second-class total).
second = int((pclass == 2).sum())
third = total - first - second

# Port of embarkation: rows with no recognised port code go to `some`.
embarked = titanic_data['Embarked']
cher = int((embarked == 'C').sum())
queens = int((embarked == 'Q').sum())
shamp = int((embarked == 'S').sum())
some = total - cher - queens - shamp

# Sex: everything that is not exactly 'male' is counted as female.
male = int((titanic_data['Sex'] == 'male').sum())
female = total - male

# Parenthesised single-argument print works under both Python 2 and 3.
print('''\tFirst: %d
Second: %d
Third: %d \n
Cherbourg: %d
Queenstown: %d
Southampton: %d
nill: %d \n
Male: %d
Female: %d''' % (first, second, third, cher, queens, shamp, some, male, female))
In [61]:
# Keep only the rows with a known age and report their shape
has_age = titanic_data['Age'].notnull()
titanic_data[has_age].shape
Out[61]:
In [44]:
# Summary statistics for the 'Age' column
titanic_data['Age'].describe()
Out[44]:
In [70]:
# Histogram (no KDE curve) of the ages that are actually recorded
known_ages = titanic_data['Age'].dropna()
sns.distplot(known_ages, kde=False)
Out[70]:
In [82]:
# Passengers older than 18 with at least one sibling or spouse aboard.
# NOTE(review): SibSp counts siblings as well as spouses, so `married`
# is only a rough proxy for marital status.
has_sibsp = titanic_data['SibSp'] > 0
is_adult = titanic_data['Age'] > 18
married = titanic_data[has_sibsp & is_adult]
married.shape
Out[82]:
In [95]:
# Overlay two age histograms on the same axes:
#   red     - adults with a sibling/spouse aboard (`married`)
#   default - adults with no sibling/spouse aboard (`not_married`)
married_ages = married['Age'].dropna()
sns.distplot(married_ages, kde=False, color='r')

no_sibsp = titanic_data['SibSp'] == 0
is_adult = titanic_data['Age'] > 18
not_married = titanic_data[no_sibsp & is_adult]
not_married_ages = not_married['Age'].dropna()
sns.distplot(not_married_ages, kde=False)
Out[95]: