notebook.community

Edit and run



In [25]:

    
!pwd









    



/Users/samantha/Documents/Personal/Samantha/GITSandbox/Kaggle_Titanic/rawcode



In [26]:

    
import numpy as np
import matplotlib as p
import pdb
from pandas import *



In [51]:

    
# Set some Pandas options
pandas.set_option('display.notebook_repr_html', False)
pandas.set_option('display.max_columns', 20)
pandas.set_option('display.max_rows', 25)
%matplotlib inline



In [28]:

    
train = read_csv('../data/train.csv')
train[:5]









    Out[28]:





   PassengerId  Survived  Pclass  \
0            1         0       3   
1            2         1       1   
2            3         1       3   
3            4         1       1   
4            5         0       3   

                                                Name     Sex  Age  SibSp  \
0                            Braund, Mr. Owen Harris    male   22      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   38      1   
2                             Heikkinen, Miss. Laina  female   26      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   35      1   
4                           Allen, Mr. William Henry    male   35      0   

   Parch            Ticket     Fare Cabin Embarked  
0      0         A/5 21171   7.2500   NaN        S  
1      0          PC 17599  71.2833   C85        C  
2      0  STON/O2. 3101282   7.9250   NaN        S  
3      0            113803  53.1000  C123        S  
4      0            373450   8.0500   NaN        S



In [29]:

    
train.shape









    Out[29]:





(891, 12)



In [44]:

    
grouped = train.groupby(['Sex','SibSp'])



In [45]:

    
#test = grouped.Survived.aggregate(np.sum)
test = grouped.Survived.mean()
test









    Out[45]:





Sex     SibSp
female  0        0.787356
        1        0.754717
        2        0.769231
        3        0.363636
        4        0.333333
        5        0.000000
        8        0.000000
male    0        0.168203
        1        0.310680
        2        0.200000
        3        0.000000
        4        0.083333
        5        0.000000
        8        0.000000
Name: Survived, dtype: float64



In [54]:

    
sex_effect = train.groupby('Sex').Survived.mean()
sex_effect









    Out[54]:





Sex
female    0.742038
male      0.188908
Name: Survived, dtype: float64



In [63]:

    
sibling_effect = train.groupby('SibSp').Survived.mean()
sibling_effect.plot()









    Out[63]:





<matplotlib.axes.AxesSubplot at 0x10dc1a790>



In [88]:

    
age_effect = train.groupby('Survived').Age.mean()
age_effect.plot(kind='bar')









    Out[88]:





<matplotlib.axes.AxesSubplot at 0x112fde550>



In [85]:

    
parch_effect = train.groupby('Parch').Survived.mean()
parch_effect.plot(kind='line')









    Out[85]:





<matplotlib.axes.AxesSubplot at 0x111c1ac50>



In [90]:

    
train.groupby(['Sex','Pclass']).Survived.sum().plot(kind='barh')









    Out[90]:





<matplotlib.axes.AxesSubplot at 0x113a64c10>



In [99]:

    
survival_counts = crosstab([train.Sex, train.Pclass], train.Survived.astype(bool))
survival_counts.plot(kind='bar', stacked=True)
survival_counts
survival_counts.div(survival_counts.sum(1).astype(float), axis=0)
#survival_counts.div() .plot(kind='bar', stacked=True)









    Out[99]:





Survived          False     True 
Sex    Pclass                    
female 1       0.031915  0.968085
       2       0.078947  0.921053
       3       0.500000  0.500000
male   1       0.631148  0.368852
       2       0.842593  0.157407
       3       0.864553  0.135447



In [101]:

    
train.columns









    Out[101]:





Index([u'PassengerId', u'Survived', u'Pclass', u'Name', u'Sex', u'Age', u'SibSp', u'Parch', u'Ticket', u'Fare', u'Cabin', u'Embarked'], dtype='object')



In [102]:

    
train.xs(0)









    Out[102]:





PassengerId                          1
Survived                             0
Pclass                               3
Name           Braund, Mr. Owen Harris
Sex                               male
Age                                 22
SibSp                                1
Parch                                0
Ticket                       A/5 21171
Fare                              7.25
Cabin                              NaN
Embarked                             S
Name: 0, dtype: object

Create new column for prediction



In [108]:

    
train['Prediction'] = 0



In [109]:

    
train.Prediction[train.Sex == 'female'] = 1
train.Prediction[train.Sex != 'female'] = 0

Rearrange colums



In [111]:

    
new_order =[11]+range(11)
train = train.reindex(columns=train.columns[new_order])
train.head()









    Out[111]:





  Cabin Embarked  PassengerId  Survived  Pclass  \
0   NaN        S            1         0       3   
1   C85        C            2         1       1   
2   NaN        S            3         1       3   
3  C123        S            4         1       1   
4   NaN        S            5         0       3   

                                                Name     Sex  Age  SibSp  \
0                            Braund, Mr. Owen Harris    male   22      1   
1  Cumings, Mrs. John Bradley (Florence Briggs Th...  female   38      1   
2                             Heikkinen, Miss. Laina  female   26      0   
3       Futrelle, Mrs. Jacques Heath (Lily May Peel)  female   35      1   
4                           Allen, Mr. William Henry    male   35      0   

   Parch            Ticket     Fare  
0      0         A/5 21171   7.2500  
1      0          PC 17599  71.2833  
2      0  STON/O2. 3101282   7.9250  
3      0            113803  53.1000  
4      0            373450   8.0500

write to a file



In [112]:

    
train.to_csv('genderbasedmodelpy.csv', index=False)