In [25]:
# Print the notebook's current working directory via the shell.
!pwd
In [26]:
import numpy as np
import matplotlib as p
import pdb
from pandas import *
In [51]:
# Set some Pandas options
pandas.set_option('display.notebook_repr_html', False)
pandas.set_option('display.max_columns', 20)
pandas.set_option('display.max_rows', 25)
%matplotlib inline
In [28]:
# Read the training data (path is relative to the working directory)
# and preview its first five rows.
train = read_csv('../data/train.csv')
train.head(5)
Out[28]:
In [29]:
# (number of rows, number of columns) of the training frame.
train.shape
Out[29]:
In [44]:
# Group the rows by every (Sex, SibSp) combination for the aggregate below.
grouped = train.groupby(by=['Sex', 'SibSp'])
In [45]:
# Mean of Survived within each (Sex, SibSp) group.
# Alternative kept for reference: grouped.Survived.aggregate(np.sum) gives
# the per-group sum instead of the mean.
test = grouped['Survived'].mean()
test
Out[45]:
In [54]:
# Mean of Survived for each value of Sex.
sex_effect = train.groupby(by='Sex')['Survived'].mean()
sex_effect
Out[54]:
In [63]:
# Mean of Survived for each SibSp value, drawn as a line chart
# (line is the default plot kind).
sibling_effect = train.groupby(by='SibSp')['Survived'].mean()
sibling_effect.plot(kind='line')
Out[63]:
In [88]:
# Mean Age within each Survived group, shown as a bar chart.
age_effect = train.groupby(by='Survived')['Age'].mean()
age_effect.plot(kind='bar')
Out[88]:
In [85]:
# Mean of Survived for each Parch value, drawn as a line chart
# (line is the default plot kind).
parch_effect = train.groupby(by='Parch')['Survived'].mean()
parch_effect.plot()
Out[85]:
In [90]:
# Sum of Survived for each (Sex, Pclass) pair as a horizontal bar chart.
(
    train
    .groupby(by=['Sex', 'Pclass'])['Survived']
    .sum()
    .plot(kind='barh')
)
Out[90]:
In [99]:
# Cross-tabulate (Sex, Pclass) rows against Survived cast to bool, giving
# one count column per outcome.
survival_counts = crosstab(
    [train['Sex'], train['Pclass']],
    train['Survived'].astype(bool),
)

# Stacked bars of the raw counts for each (Sex, Pclass) row.
survival_counts.plot(kind='bar', stacked=True)

# A bare `survival_counts` expression used to sit here, but only the last
# expression of a cell is displayed, so it had no effect and was removed.
# Evaluate `survival_counts` in its own cell to inspect the raw counts.

# Divide each row by its total so the displayed table holds per-row
# proportions rather than counts.
survival_counts.div(survival_counts.sum(axis=1).astype(float), axis=0)
Out[99]:
In [101]:
# Show the column labels of the training frame.
train.columns
Out[101]:
In [102]:
# Show the row whose index label is 0 as a Series.
train.loc[0]
Out[102]:
In [108]:
# Add (or reset) a Prediction column holding 0 in every row.
train['Prediction'] = 0
In [109]:
# Prediction is 1 for rows where Sex is 'female' and 0 for all other rows.
# Assign through .loc on the frame itself: the chained form
# `train.Prediction[mask] = value` writes via an intermediate Series, which
# raises SettingWithCopyWarning and is not guaranteed to update `train`.
train.loc[train.Sex == 'female', 'Prediction'] = 1
train.loc[train.Sex != 'female', 'Prediction'] = 0
In [111]:
# Move the Prediction column to the front, keeping every other column in its
# current relative order.
# The position is looked up by name instead of hard-coding 11: the literal
# only worked when Prediction sat at position 11 of a 12-column frame, and
# `[11] + range(11)` raises TypeError on Python 3 because range is not a list.
prediction_pos = train.columns.get_loc('Prediction')
new_order = [prediction_pos] + [
    pos for pos in range(len(train.columns)) if pos != prediction_pos
]
train = train.reindex(columns=train.columns[new_order])
train.head()
Out[111]:
In [112]:
# Write the frame to a CSV file in the working directory; index=False
# leaves the row index out of the file.
train.to_csv('genderbasedmodelpy.csv', index=False)