Data Exploration


In [18]:
# third party
import pandas
import matplotlib.pyplot as plot
import seaborn
from tabulate import tabulate

In [3]:
%matplotlib inline

In [5]:
# Load the training set from train.csv (path is relative to the working directory).
train_data = pandas.read_csv(filepath_or_buffer='train.csv')

In [7]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
train_data.shape


Out[7]:
(891, 12)

In [40]:
# Summarise only the object-dtype columns (count / unique / top / freq).
describe_strings = train_data.describe(include=[object])
print(describe_strings.columns.values)
# Table headers: a label for the statistics index column followed by the
# names of the described columns.
columns = ['Statistics'] + describe_strings.columns.tolist()


['Name' 'Sex' 'Ticket' 'Cabin' 'Embarked']

In [41]:
# Render the string-column summary as a plain-text table, then print it.
string_summary_table = tabulate(describe_strings, headers=columns)
print(string_summary_table)


Statistics    Name                       Sex    Ticket    Cabin        Embarked
------------  -------------------------  -----  --------  -----------  ----------
count         891                        891    891       204          889
unique        891                        2      681       147          3
top           Graham, Mr. George Edward  male   CA. 2343  C23 C25 C27  S
freq          1                          577    7         4            644

In [42]:
# Default describe() output for the frame, printed as a plain-text table.
describe_numbers = train_data.describe()
# Header row: a label for the statistics index column, then one entry per
# described column.
number_columns = ['Statistics']
number_columns.extend(describe_numbers.columns)
print(tabulate(describe_numbers, headers=number_columns))


Statistics      PassengerId    Survived      Pclass       Age       SibSp       Parch      Fare
------------  -------------  ----------  ----------  --------  ----------  ----------  --------
count               891      891         891         714       891         891         891
mean                446        0.383838    2.30864    29.6991    0.523008    0.381594   32.2042
std                 257.354    0.486592    0.836071   14.5265    1.10274     0.806057   49.6934
min                   1        0           1           0.42      0           0           0
25%                 223.5      0           2          20.125     0           0           7.9104
50%                 446        0           3          28         0           0          14.4542
75%                 668.5      1           3          38         1           0          31
max                 891        1           3          80         8           6         512.329

In [52]:
# For every column that contains missing values, report the fraction of rows
# that still hold a value.
missing_data = [column for column in train_data.columns if train_data[column].hasnans]
# float() keeps the division below a true division even under Python 2.
rows = float(len(train_data))
# dropna() removes the missing entries, so its length is the number of
# available values in the column.
row_data = [(column, len(train_data[column].dropna()) / rows) for column in missing_data]
# The header expression previously ended in a dangling `.` followed by an
# unbalanced `)`, which is a SyntaxError; the two labels are now passed as an
# explicit list.
print(tabulate(row_data, headers=['Column', 'Fraction Available']))


Column      Fraction Available
--------  --------------------
Age                   0.801347
Cabin                 0.228956
Embarked              0.997755

Survival Rate


In [16]:
# Number of passengers for each value of the Survived column.
survived_counts = train_data['Survived'].value_counts()
print(survived_counts)
# Same counts expressed as a fraction of all rows.
survival_rates = survived_counts / len(train_data['PassengerId'])
print(survival_rates)


0    549
1    342
Name: Survived, dtype: int64
0    0.616162
1    0.383838
Name: Survived, dtype: float64

In [ ]: