In [18]:
# third party
import pandas
import matplotlib.pyplot as plot
import seaborn
from tabulate import tabulate
In [3]:
# Ask IPython to render matplotlib figures inline in the notebook output.
%matplotlib inline
In [5]:
# Load the training set from the working directory into a DataFrame.
# NOTE(review): presumably the Kaggle Titanic training file (later cells read
# PassengerId and Survived) -- confirm the data source.
train_data = pandas.read_csv(filepath_or_buffer='train.csv')
In [7]:
# Display the (rows, columns) dimensions of the loaded frame.
train_data.shape
Out[7]:
In [40]:
# Summary statistics restricted to the object-dtype columns.
describe_strings = train_data.describe(include=[object])

# Show which columns were summarised.
print(describe_strings.columns.values)

# Header row for the table: a label for the statistic-name column followed by
# one header per summarised column.
columns = ['Statistics'] + list(describe_strings.columns.values)
In [41]:
# Render the object-column summary as a plain-text table, using the header
# list built in the previous cell.
string_table = tabulate(describe_strings, headers=columns)
print(string_table)
In [42]:
# Summary statistics using describe()'s default column selection.
describe_numbers = train_data.describe()

# Header row: a label for the statistic-name column, then every summarised column.
number_columns = ['Statistics']
number_columns.extend(describe_numbers.columns)

print(tabulate(describe_numbers, headers=number_columns))
In [52]:
# Report, for every column that has at least one missing value, the fraction
# of rows in which that column still holds a value.
missing_data = [column for column in train_data.columns if train_data[column].hasnans]

# float() keeps the division below fractional even under Python 2 semantics.
rows = float(len(train_data))

# Series.count() is the number of non-missing entries in the column.
row_data = [(column, train_data[column].count() / rows) for column in missing_data]

# One header per field of the (column, fraction) tuples.
print(tabulate(row_data, headers=['Column', 'Fraction Available']))
In [16]:
# Number of rows for each distinct value of the Survived column.
survived_counts = train_data['Survived'].value_counts()

# Same counts expressed as a share of all rows in the frame.
total_passengers = len(train_data['PassengerId'])
survival_rates = survived_counts / total_passengers

# Print the raw counts first, then the rates.
for summary in (survived_counts, survival_rates):
    print(summary)
In [ ]: