In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# import data
data = pd.read_csv('train.csv')

In [3]:
print data.shape
data.head(1)


(891, 12)
Out[3]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.25 NaN S

In [4]:
data.rename(columns={'Survived' : 'survived', 'Pclass' : 'pass_class', 'Sex' : 'sex', 'Age' : 'age', 'SibSp' : 'horiz_rel', 'Parch' : 'vert_rel', 'Ticket' : 'ticket', 'Fare' : 'fare', 'Cabin' : 'cabin', 'Embarked' : 'embarked'}, inplace=True)

In [5]:
pd.value_counts(data.survived)


Out[5]:
0    549
1    342
dtype: int64

In [6]:
pd.value_counts(data.sex)


Out[6]:
male      577
female    314
dtype: int64

In [7]:
pd.crosstab(data.sex, data.survived)


Out[7]:
survived 0 1
sex
female 81 233
male 468 109

In [8]:
fem = data[data.sex == 'female']
fem_total = fem.sex.count()
fem_surv = fem[fem.survived == 1]
fem_surv_total = fem_surv.sex.count()
print str(round((fem_surv_total/(fem_total*1.0))*100, 2)) + "% of females survived"


74.2% of females survived

In [9]:
fem_surv.age.median()


Out[9]:
28.0

In [10]:
fem_surv.age.describe()


Out[10]:
count    197.000000
mean      28.847716
std       14.175073
min        0.750000
25%       19.000000
50%       28.000000
75%       38.000000
max       63.000000
dtype: float64

In [11]:
male = data[data.sex == 'male']
male_total = male.sex.count()
male_surv = male[male.survived == 1]
male_surv_total = male_surv.sex.count()
print str(round((male_surv_total/(male_total*1.0))*100, 2)) + "% of males survived"


18.89% of males survived

In [12]:
pd.crosstab(data.pass_class, data.survived)


Out[12]:
survived 0 1
pass_class
1 80 136
2 97 87
3 372 119

In [73]:
df = ['pass_class','horiz_rel','vert_rel','fare']
sns.pairplot(data, vars=df, hue='survived', size=2.5, diag_kind='kde')


Out[73]:
<seaborn.axisgrid.PairGrid at 0x11db0e990>

In [14]:
pd.crosstab(data.horiz_rel, data.survived)


Out[14]:
survived 0 1
horiz_rel
0 398 210
1 97 112
2 15 13
3 12 4
4 15 3
5 5 0
8 7 0

In [15]:
pd.crosstab(data.vert_rel, data.survived)


Out[15]:
survived 0 1
vert_rel
0 445 233
1 53 65
2 40 40
3 2 3
4 4 0
5 4 1
6 1 0

In [16]:
#pd.crosstab(data.ticket, data.survived)

In [17]:
#pd.crosstab(data.fare, data.survived)
data.fare.describe()


Out[17]:
count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
dtype: float64

In [18]:
#pd.crosstab(data.cabin, data.survived)

In [19]:
pd.crosstab(data.embarked, data.survived)


Out[19]:
survived 0 1
embarked
C 75 93
Q 47 30
S 427 217

In [20]:
plt.scatter(survived.age, survived.fare, marker='o', color='b')
plt.scatter(died.age, died.fare, marker='x', color='r')
plt.xlabel('Age')
plt.ylabel('Fare')
plt.show()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-20-15636cd6e130> in <module>()
----> 1 plt.scatter(survived.age, survived.fare, marker='o', color='b')
      2 plt.scatter(died.age, died.fare, marker='x', color='r')
      3 plt.xlabel('Age')
      4 plt.ylabel('Fare')
      5 plt.show()

NameError: name 'survived' is not defined

In [ ]:
lesst40surv = survived[survived.age < 40]
print len(lesst40surv)
greatt40surv = survived[survived.age > 40]
print len(greatt40surv)

In [ ]:
lesst40died = died[died.age < 40]
print len(lesst40died)
greatt40died = died[died.age > 40]
print len(greatt40died)

In [ ]:
lesst40surv = survived[survived.fare < 30]
print len(lesst40surv)
greatt40surv = survived[survived.fare > 30]
print len(greatt40surv)

In [ ]:
lesst40died = died[died.fare < 30]
print len(lesst40died)
greatt40died = died[died.fare > 30]
print len(greatt40died)

Survivors tended to have one or more of the following attributes: female, younger (e.g., age < 40), higher-class.