In [25]:
!pwd


/Users/samantha/Documents/Personal/Samantha/GITSandbox/Kaggle_Titanic/rawcode

In [26]:
import numpy as np
import matplotlib as p
import pdb
from pandas import *

In [51]:
# Set some Pandas options
pandas.set_option('display.notebook_repr_html', False)
pandas.set_option('display.max_columns', 20)
pandas.set_option('display.max_rows', 25)
%matplotlib inline

In [28]:
train = read_csv('../data/train.csv')
train[:5]


Out[28]:
   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex  Age  SibSp  \
0                            Braund, Mr. Owen Harris    male   22      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   38      1   
2                             Heikkinen, Miss. Laina  female   26      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   35      1   
4                           Allen, Mr. William Henry    male   35      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S  

In [29]:
train.shape


Out[29]:
(891, 12)

In [44]:
grouped = train.groupby(['Sex','SibSp'])

In [45]:
#test = grouped.Survived.aggregate(np.sum)
test = grouped.Survived.mean()
test


Out[45]:
Sex     SibSp
female  0        0.787356
        1        0.754717
        2        0.769231
        3        0.363636
        4        0.333333
        5        0.000000
        8        0.000000
male    0        0.168203
        1        0.310680
        2        0.200000
        3        0.000000
        4        0.083333
        5        0.000000
        8        0.000000
Name: Survived, dtype: float64

In [54]:
sex_effect = train.groupby('Sex').Survived.mean()
sex_effect


Out[54]:
Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64

In [63]:
sibling_effect = train.groupby('SibSp').Survived.mean()
sibling_effect.plot()


Out[63]:
<matplotlib.axes.AxesSubplot at 0x10dc1a790>

In [88]:
age_effect = train.groupby('Survived').Age.mean()
age_effect.plot(kind='bar')


Out[88]:
<matplotlib.axes.AxesSubplot at 0x112fde550>

In [85]:
parch_effect = train.groupby('Parch').Survived.mean()
parch_effect.plot(kind='line')


Out[85]:
<matplotlib.axes.AxesSubplot at 0x111c1ac50>

In [90]:
train.groupby(['Sex','Pclass']).Survived.sum().plot(kind='barh')


Out[90]:
<matplotlib.axes.AxesSubplot at 0x113a64c10>

In [99]:
survival_counts = crosstab([train.Sex, train.Pclass], train.Survived.astype(bool))
survival_counts.plot(kind='bar', stacked=True)
survival_counts
survival_counts.div(survival_counts.sum(1).astype(float), axis=0)
#survival_counts.div() .plot(kind='bar', stacked=True)


Out[99]:
Survived          False     True 
Sex    Pclass                    
female 1       0.031915  0.968085
       2       0.078947  0.921053
       3       0.500000  0.500000
male   1       0.631148  0.368852
       2       0.842593  0.157407
       3       0.864553  0.135447

In [101]:
train.columns


Out[101]:
Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age', u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked'], dtype='object')

In [102]:
train.xs(0)


Out[102]:
PassengerId                          1
Survived                             0
Pclass                               3
Name           Braund, Mr. Owen Harris
Sex                               male
Age                                 22
SibSp                                1
Parch                                0
Ticket                       A/5 21171
Fare                              7.25
Cabin                              NaN
Embarked                             S
Name: 0, dtype: object
Create new column for prediction

In [108]:
train['Prediction'] = 0

In [109]:
train.Prediction[train.Sex == 'female'] = 1
train.Prediction[train.Sex != 'female'] = 0
Rearrange colums

In [111]:
new_order =[11]+range(11)
train = train.reindex(columns=train.columns[new_order])
train.head()


Out[111]:
  Cabin Embarked  PassengerId  Survived  Pclass  \
0   NaN        S            1         0       3   
1   C85        C            2         1       1   
2   NaN        S            3         1       3   
3  C123        S            4         1       1   
4   NaN        S            5         0       3   

                                                Name     Sex  Age  SibSp  \
0                            Braund, Mr. Owen Harris    male   22      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   38      1   
2                             Heikkinen, Miss. Laina  female   26      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   35      1   
4                           Allen, Mr. William Henry    male   35      0   

   Parch            Ticket     Fare  
0      0         A/5 21171   7.2500  
1      0          PC 17599  71.2833  
2      0  STON/O2. 3101282   7.9250  
3      0            113803  53.1000  
4      0            373450   8.0500  
write to a file

In [112]:
train.to_csv('genderbasedmodelpy.csv', index=False)