In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Read train.csv (resolved against the current working directory) into the
# frame that every later cell refers to as `data`.
data = pd.read_csv(filepath_or_buffer='train.csv')
In [3]:
# Quick sanity check of the load: the (rows, columns) tuple, then the first
# record rendered as the cell output.
# print(...) with a single argument prints the same text on Python 2 and 3,
# whereas the bare print statement is a SyntaxError on Python 3.
print(data.shape)
data.head(1)
Out[3]:
In [4]:
# Give the listed CSV headers short lowercase names.  Columns that are not
# mentioned keep their original header.  The rename mutates `data` in place.
data.rename(
    columns=dict(
        Survived='survived',
        Pclass='pass_class',
        Sex='sex',
        Age='age',
        SibSp='horiz_rel',
        Parch='vert_rel',
        Ticket='ticket',
        Fare='fare',
        Cabin='cabin',
        Embarked='embarked',
    ),
    inplace=True,
)
In [5]:
# Frequency of each value in the `survived` column, most common first.
# Uses the Series method: the top-level pd.value_counts() helper is deprecated
# in recent pandas, and the method returns the same result on old releases too.
data.survived.value_counts()
Out[5]:
In [6]:
# Frequency of each value in the `sex` column, most common first.
# Uses the Series method: the top-level pd.value_counts() helper is deprecated
# in recent pandas, and the method returns the same result on old releases too.
data.sex.value_counts()
Out[6]:
In [7]:
# Contingency table: one row per `sex` value, one column per `survived` value.
pd.crosstab(index=data['sex'], columns=data['survived'])
Out[7]:
In [8]:
# Share of female passengers who survived.
# `fem_surv` stays a notebook-level name because the following cells
# summarise its `age` column.
fem = data[data.sex == 'female']
fem_total = fem.sex.count()            # count() ignores missing values
fem_surv = fem[fem.survived == 1]
fem_surv_total = fem_surv.sex.count()
# float() forces true division on Python 2 as well (replaces the `* 1.0` trick).
fem_surv_pct = round(fem_surv_total / float(fem_total) * 100, 2)
# Parenthesised print works on both Python 2 and Python 3.
print(str(fem_surv_pct) + "% of females survived")
In [9]:
# Median age among the surviving female passengers selected in `fem_surv`.
fem_surv['age'].median()
Out[9]:
In [10]:
# Summary statistics of age for the surviving female passengers in `fem_surv`.
fem_surv['age'].describe()
Out[10]:
In [11]:
# Share of male passengers who survived (same computation as for females).
male = data[data.sex == 'male']
male_total = male.sex.count()          # count() ignores missing values
male_surv = male[male.survived == 1]
male_surv_total = male_surv.sex.count()
# float() forces true division on Python 2 as well (replaces the `* 1.0` trick).
male_surv_pct = round(male_surv_total / float(male_total) * 100, 2)
# Parenthesised print works on both Python 2 and Python 3.
print(str(male_surv_pct) + "% of males survived")
In [12]:
# Contingency table: one row per `pass_class` value, one column per `survived` value.
pd.crosstab(index=data['pass_class'], columns=data['survived'])
Out[12]:
In [73]:
# Pairwise plots of the selected columns, coloured by the `survived` value,
# with a kernel density estimate on the diagonal.
# The explicit `size=2.5` argument was dropped: seaborn 0.9 renamed that
# keyword to `height` (passing `size` now triggers a warning), and 2.5 is the
# default under either name, so omitting it yields the same figure on both old
# and new seaborn releases.
# NOTE: despite its name, `df` is a plain list of column names, not a DataFrame.
df = ['pass_class', 'horiz_rel', 'vert_rel', 'fare']
sns.pairplot(data, vars=df, hue='survived', diag_kind='kde')
Out[73]:
In [14]:
# Contingency table: one row per `horiz_rel` value, one column per `survived` value.
pd.crosstab(index=data['horiz_rel'], columns=data['survived'])
Out[14]:
In [15]:
# Contingency table: one row per `vert_rel` value, one column per `survived` value.
pd.crosstab(index=data['vert_rel'], columns=data['survived'])
Out[15]:
In [16]:
#pd.crosstab(data.ticket, data.survived)
In [17]:
# A per-value crosstab of fare against survival was left disabled:
#pd.crosstab(data.fare, data.survived)
# Summary statistics of the `fare` column are displayed instead.
data['fare'].describe()
Out[17]:
In [18]:
#pd.crosstab(data.cabin, data.survived)
In [19]:
# Contingency table: one row per `embarked` value, one column per `survived` value.
pd.crosstab(index=data['embarked'], columns=data['survived'])
Out[19]:
In [20]:
# Fare against age, one marker style per outcome.
# `survived` and `died` are not assigned in any visible cell, so on a fresh
# kernel this cell raised NameError; they are now derived from `data` here.
# NOTE(review): assumes survived == 1 marks survivors (the same coding the
# per-sex survival-rate cells filter on) and survived == 0 marks everyone
# else -- confirm against the dataset description.
survived = data[data.survived == 1]
died = data[data.survived == 0]
plt.scatter(survived.age, survived.fare, marker='o', color='b', label='survived')
plt.scatter(died.age, died.fare, marker='x', color='r', label='died')
plt.xlabel('Age')
plt.ylabel('Fare')
plt.legend()  # identifies which marker/colour is which outcome
plt.show()
In [ ]:
# Number of rows in `survived` on either side of age 40.
# `survived` must already be bound by an earlier cell (presumably the rows of
# `data` with survived == 1 -- confirm).
# NOTE(review): both comparisons are strict, so rows with age exactly 40, and
# rows whose age is missing, land in neither group -- confirm this is intended.
lesst40surv = survived[survived.age < 40]
print(len(lesst40surv))   # parenthesised print: valid on Python 2 and 3
greatt40surv = survived[survived.age > 40]
print(len(greatt40surv))
In [ ]:
# Number of rows in `died` on either side of age 40.
# `died` must already be bound by an earlier cell (presumably the rows of
# `data` with survived == 0 -- confirm).
# NOTE(review): both comparisons are strict, so rows with age exactly 40, and
# rows whose age is missing, land in neither group -- confirm this is intended.
lesst40died = died[died.age < 40]
print(len(lesst40died))   # parenthesised print: valid on Python 2 and 3
greatt40died = died[died.age > 40]
print(len(greatt40died))
In [ ]:
# Number of rows in `survived` on either side of a fare of 30.
# `survived` must already be bound by an earlier cell (presumably the rows of
# `data` with survived == 1 -- confirm).
# NOTE(review): the variable names are carried over from the age split, so
# these assignments overwrite the age-based subsets with fare-based ones; the
# names are kept unchanged in case other cells read them.
# NOTE(review): both comparisons are strict, so a fare of exactly 30 is
# counted in neither group -- confirm this is intended.
lesst40surv = survived[survived.fare < 30]
print(len(lesst40surv))   # parenthesised print: valid on Python 2 and 3
greatt40surv = survived[survived.fare > 30]
print(len(greatt40surv))
In [ ]:
# Number of rows in `died` on either side of a fare of 30.
# `died` must already be bound by an earlier cell (presumably the rows of
# `data` with survived == 0 -- confirm).
# NOTE(review): the variable names are carried over from the age split, so
# these assignments overwrite the age-based subsets with fare-based ones; the
# names are kept unchanged in case other cells read them.
# NOTE(review): both comparisons are strict, so a fare of exactly 30 is
# counted in neither group -- confirm this is intended.
lesst40died = died[died.fare < 30]
print(len(lesst40died))   # parenthesised print: valid on Python 2 and 3
greatt40died = died[died.fare > 30]
print(len(greatt40died))
Survivors tended to have one or more of the following attributes: female, younger (e.g., age < 40), higher-class.