In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# import data
data = pd.read_csv('train.csv')

In [3]:
print data.shape
data.head(1)


(891, 12)
Out[3]:
PassengerId Survived Pclass Name Sex Age SibSp Parch Ticket Fare Cabin Embarked
0 1 0 3 Braund, Mr. Owen Harris male 22 1 0 A/5 21171 7.25 NaN S

In [4]:
data.rename(columns={'Survived' : 'survived', 'Pclass' : 'pass_class', 'Sex' : 'sex', 'Age' : 'age', 'SibSp' : 'horiz_rel', 'Parch' : 'vert_rel', 'Ticket' : 'ticket', 'Fare' : 'fare', 'Cabin' : 'cabin', 'Embarked' : 'embarked'}, inplace=True)

In [5]:
pd.value_counts(data.survived)


Out[5]:
0    549
1    342
dtype: int64

In [6]:
data[data['survived'] == 0].describe()


Out[6]:
PassengerId survived pass_class age horiz_rel vert_rel fare
count 549.000000 549 549.000000 424.000000 549.000000 549.000000 549.000000
mean 447.016393 0 2.531876 30.626179 0.553734 0.329690 22.117887
std 260.640469 0 0.735805 14.172110 1.288399 0.823166 31.388207
min 1.000000 0 1.000000 1.000000 0.000000 0.000000 0.000000
25% 211.000000 0 2.000000 21.000000 0.000000 0.000000 7.854200
50% 455.000000 0 3.000000 28.000000 0.000000 0.000000 10.500000
75% 675.000000 0 3.000000 39.000000 1.000000 0.000000 26.000000
max 891.000000 0 3.000000 74.000000 8.000000 6.000000 263.000000

In [29]:
#data[data['survived'] == 0].boxplot(column=['age'])

In [8]:
data[data['survived'] == 1].describe()


Out[8]:
PassengerId survived pass_class age horiz_rel vert_rel fare
count 342.000000 342 342.000000 290.000000 342.000000 342.000000 342.000000
mean 444.368421 1 1.950292 28.343690 0.473684 0.464912 48.395408
std 252.358840 0 0.863321 14.950952 0.708688 0.771712 66.596998
min 2.000000 1 1.000000 0.420000 0.000000 0.000000 0.000000
25% 250.750000 1 1.000000 19.000000 0.000000 0.000000 12.475000
50% 439.500000 1 2.000000 28.000000 0.000000 0.000000 26.000000
75% 651.500000 1 3.000000 36.000000 1.000000 1.000000 57.000000
max 890.000000 1 3.000000 80.000000 4.000000 5.000000 512.329200

In [9]:
data[data['survived'] == 1].boxplot(column=['age'])


Out[9]:
{'boxes': [<matplotlib.lines.Line2D at 0x10bc65690>],
 'caps': [<matplotlib.lines.Line2D at 0x10bc72650>,
  <matplotlib.lines.Line2D at 0x10bc72bd0>],
 'fliers': [<matplotlib.lines.Line2D at 0x10bc7e890>],
 'means': [],
 'medians': [<matplotlib.lines.Line2D at 0x10bc7e250>],
 'whiskers': [<matplotlib.lines.Line2D at 0x10bc65910>,
  <matplotlib.lines.Line2D at 0x10bc65fd0>]}

In [10]:
pd.value_counts(data.sex)


Out[10]:
male      577
female    314
dtype: int64

In [11]:
pd.crosstab(data.sex, data.survived)


Out[11]:
survived 0 1
sex
female 81 233
male 468 109

In [30]:
pd.crosstab(data.embarked, data.survived)


Out[30]:
survived 0 1
embarked
C 75 93
Q 47 30
S 427 217

In [13]:
fem = data[data.sex == 'female']
fem_total = fem.sex.count()
fem_surv = fem[fem.survived == 1]
fem_surv_total = fem_surv.sex.count()
print str(round((fem_surv_total/(fem_total*1.0))*100, 2)) + "% of females survived"


74.2% of females survived

In [14]:
fem_surv.age.median()


Out[14]:
28.0

In [15]:
fem_surv.age.describe()


Out[15]:
count    197.000000
mean      28.847716
std       14.175073
min        0.750000
25%       19.000000
50%       28.000000
75%       38.000000
max       63.000000
dtype: float64

In [16]:
male = data[data.sex == 'male']
male_total = male.sex.count()
male_surv = male[male.survived == 1]
male_surv_total = male_surv.sex.count()
print str(round((male_surv_total/(male_total*1.0))*100, 2)) + "% of males survived"


18.89% of males survived

In [17]:
pd.crosstab(data.pass_class, data.survived)


Out[17]:
survived 0 1
pass_class
1 80 136
2 97 87
3 372 119

In [18]:
pd.crosstab(data.pass_class, data.sex)


Out[18]:
sex female male
pass_class
1 94 122
2 76 108
3 144 347

In [40]:
df = ['pass_class','horiz_rel','vert_rel','fare']
#df = ['horiz_rel','vert_rel']
#sns.pairplot(data, vars=df, hue='survived', size=2.5, diag_kind='kde')
sns.pairplot(data[(data['sex'] == 'male') & (data['embarked'] == 'C')], vars=df, hue='survived', size=2.5, diag_kind='kde')


Out[40]:
<seaborn.axisgrid.PairGrid at 0x114bbc790>

In [20]:
data[data['survived'] == 0].groupby('age').size().plot(kind="bar");



In [21]:
data[data['survived'] == 1].groupby('age').size().plot(kind="bar");



In [22]:
pd.crosstab(data.horiz_rel, data.survived)


Out[22]:
survived 0 1
horiz_rel
0 398 210
1 97 112
2 15 13
3 12 4
4 15 3
5 5 0
8 7 0

In [23]:
pd.crosstab(data.vert_rel, data.survived)


Out[23]:
survived 0 1
vert_rel
0 445 233
1 53 65
2 40 40
3 2 3
4 4 0
5 4 1
6 1 0

In [24]:
#pd.crosstab(data.ticket, data.survived)

In [25]:
#pd.crosstab(data.fare, data.survived)
data.fare.describe()


Out[25]:
count    891.000000
mean      32.204208
std       49.693429
min        0.000000
25%        7.910400
50%       14.454200
75%       31.000000
max      512.329200
dtype: float64

In [26]:
#pd.crosstab(data.cabin, data.survived)

In [27]:
pd.crosstab(data.embarked, data.survived)


Out[27]:
survived 0 1
embarked
C 75 93
Q 47 30
S 427 217

In [28]:
plt.scatter(survived.age, survived.fare, marker='o', color='b')
plt.scatter(died.age, died.fare, marker='x', color='r')
plt.xlabel('Age')
plt.ylabel('Fare')
plt.show()


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-28-15636cd6e130> in <module>()
----> 1 plt.scatter(survived.age, survived.fare, marker='o', color='b')
      2 plt.scatter(died.age, died.fare, marker='x', color='r')
      3 plt.xlabel('Age')
      4 plt.ylabel('Fare')
      5 plt.show()

NameError: name 'survived' is not defined

In [ ]:
lesst40surv = survived[survived.age < 40]
print len(lesst40surv)
greatt40surv = survived[survived.age > 40]
print len(greatt40surv)

In [ ]:
lesst40died = died[died.age < 40]
print len(lesst40died)
greatt40died = died[died.age > 40]
print len(greatt40died)

In [ ]:
lesst40surv = survived[survived.fare < 30]
print len(lesst40surv)
greatt40surv = survived[survived.fare > 30]
print len(greatt40surv)

In [ ]:
lesst40died = died[died.fare < 30]
print len(lesst40died)
greatt40died = died[died.fare > 30]
print len(greatt40died)

Survivors tended to have one or more of the following attributes: female, first-class.