notebook.community

Edit and run



In [141]:

    
import pandas as pd
import numpy as np
import pylab as P
# Import the random forest package
from sklearn.ensemble import RandomForestClassifier 


# The first row of the CSV holds the column names, so header=0 is stated explicitly.
df = pd.read_csv(
    'data/titanic-kaggle/train.csv',
    header=0,
)
df.head()









    Out[141]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S



In [142]:

    
# Show the dtype pandas assigned to each column of the training frame.
df.dtypes









    Out[142]:





PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object



In [143]:

    
# Print per-column non-null counts and dtypes; columns with fewer non-null
# entries than rows contain missing values.
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.6+ KB



In [144]:

    
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()









    Out[144]:






  
    
      
      PassengerId
      Survived
      Pclass
      Age
      SibSp
      Parch
      Fare
    
  
  
    
      count
      891.000000
      891.000000
      891.000000
      714.000000
      891.000000
      891.000000
      891.000000
    
    
      mean
      446.000000
      0.383838
      2.308642
      29.699118
      0.523008
      0.381594
      32.204208
    
    
      std
      257.353842
      0.486592
      0.836071
      14.526497
      1.102743
      0.806057
      49.693429
    
    
      min
      1.000000
      0.000000
      1.000000
      0.420000
      0.000000
      0.000000
      0.000000
    
    
      25%
      223.500000
      0.000000
      2.000000
      20.125000
      0.000000
      0.000000
      7.910400
    
    
      50%
      446.000000
      0.000000
      3.000000
      28.000000
      0.000000
      0.000000
      14.454200
    
    
      75%
      668.500000
      1.000000
      3.000000
      38.000000
      1.000000
      0.000000
      31.000000
    
    
      max
      891.000000
      1.000000
      3.000000
      80.000000
      8.000000
      6.000000
      512.329200

Convert Sex feature to numeric



In [145]:

    
# Encode Sex as an integer column: female -> 0, male -> 1.
sex_to_code = {'female': 0, 'male': 1}
gender_codes = df['Sex'].map(sex_to_code)
df['Gender'] = gender_codes.astype(int)
df.head()









    Out[145]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
      Gender
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
      1
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
      0
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
      0
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
      0
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S
      1

Fill missing Age values

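The age histogram is skewed towards passengers in their 20s and 30s, so the mean age is not a representative fill value. Instead, fill each missing age with the median age of passengers of the same gender and passenger class.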



In [146]:

    
# Histogram of the known ages: 16 bins over 0-80, i.e. 5-year buckets.
known_ages = df['Age'].dropna()
known_ages.hist(bins=16, range=(0, 80), alpha=0.5)
P.show()



In [147]:

    
# Zero-filled 2x3 table that will hold one median age per (Gender, Pclass) pair:
# rows are indexed by Gender code, columns by Pclass - 1.
median_ages = np.zeros(shape=(2, 3))
median_ages









    Out[147]:





array([[ 0.,  0.,  0.],
       [ 0.,  0.,  0.]])



In [148]:

    
# Fill the table with the median known age of every (Gender, Pclass) group.
for gender in (0, 1):
    for pclass in (1, 2, 3):
        in_group = (df['Gender'] == gender) & (df['Pclass'] == pclass)
        group_ages = df[in_group]['Age'].dropna()
        # Pclass is 1-based, the table columns are 0-based.
        median_ages[gender, pclass - 1] = group_ages.median()

median_ages









    Out[148]:





array([[ 35. ,  28. ,  21.5],
       [ 40. ,  30. ,  25. ]])



In [149]:

    
# Start AgeFill as a copy of Age so the original Age column stays untouched;
# the missing entries are still NaN at this point and are filled in a later cell.
df['AgeFill'] = df['Age']
df.head()









    Out[149]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
      Gender
      AgeFill
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
      1
      22.0
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
      0
      38.0
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
      0
      26.0
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
      0
      35.0
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S
      1
      35.0



In [150]:

    
# Preview the first ten rows whose Age is missing, before any imputation.
df[df['Age'].isnull()][['Gender','Pclass','Age','AgeFill']].head(10)



In [151]:

    
# For every row with a missing Age, write the median age of that row's
# (Gender, Pclass) group into AgeFill. Age itself is never modified.
age_missing = df['Age'].isnull()
for gender in (0, 1):
    for pclass in (1, 2, 3):
        needs_fill = age_missing & (df['Gender'] == gender) & (df['Pclass'] == pclass)
        df.loc[needs_fill, 'AgeFill'] = median_ages[gender, pclass - 1]

# Same preview as before the fill, now showing the imputed values.
df[age_missing][['Gender', 'Pclass', 'Age', 'AgeFill']].head(10)



In [152]:

    
# 1/0 indicator recording which passengers had no Age in the raw data.
age_was_missing = df['Age'].isnull()
df['AgeIsNull'] = age_was_missing.astype(int)
df.describe()









    Out[152]:






  
    
      
      PassengerId
      Survived
      Pclass
      Age
      SibSp
      Parch
      Fare
      Gender
      AgeFill
      AgeIsNull
    
  
  
    
      count
      891.000000
      891.000000
      891.000000
      714.000000
      891.000000
      891.000000
      891.000000
      891.000000
      891.000000
      891.000000
    
    
      mean
      446.000000
      0.383838
      2.308642
      29.699118
      0.523008
      0.381594
      32.204208
      0.647587
      29.112424
      0.198653
    
    
      std
      257.353842
      0.486592
      0.836071
      14.526497
      1.102743
      0.806057
      49.693429
      0.477990
      13.304424
      0.399210
    
    
      min
      1.000000
      0.000000
      1.000000
      0.420000
      0.000000
      0.000000
      0.000000
      0.000000
      0.420000
      0.000000
    
    
      25%
      223.500000
      0.000000
      2.000000
      20.125000
      0.000000
      0.000000
      7.910400
      0.000000
      21.500000
      0.000000
    
    
      50%
      446.000000
      0.000000
      3.000000
      28.000000
      0.000000
      0.000000
      14.454200
      1.000000
      26.000000
      0.000000
    
    
      75%
      668.500000
      1.000000
      3.000000
      38.000000
      1.000000
      0.000000
      31.000000
      1.000000
      36.000000
      0.000000
    
    
      max
      891.000000
      1.000000
      3.000000
      80.000000
      8.000000
      6.000000
      512.329200
      1.000000
      80.000000
      1.000000

Fill missing Embarked



In [153]:

    
# Show the rows (at most ten) whose Embarked value is missing.
df[df['Embarked'].isnull()][['Fare','Pclass','AgeFill','Embarked']].head(10)



In [154]:

    
# Port is Embarked encoded as an integer: S -> 1, Q -> 2, C -> 3.
# Missing ports are filled with 'S'. Using fillna with a scalar works for any
# number of missing rows, whereas assigning the fixed list ['S', 'S'] only
# worked when exactly two rows were missing.
# The integer codes are arbitrary labels, not a meaningful ordering.
df['Port'] = (
    df['Embarked']
    .fillna('S')
    .map({'S': 1, 'Q': 2, 'C': 3})
    .astype(int)
)
df.head()









    Out[154]:






  
    
      
      PassengerId
      Survived
      Pclass
      Name
      Sex
      Age
      SibSp
      Parch
      Ticket
      Fare
      Cabin
      Embarked
      Gender
      AgeFill
      AgeIsNull
      Port
    
  
  
    
      0
      1
      0
      3
      Braund, Mr. Owen Harris
      male
      22.0
      1
      0
      A/5 21171
      7.2500
      NaN
      S
      1
      22.0
      0
      1
    
    
      1
      2
      1
      1
      Cumings, Mrs. John Bradley (Florence Briggs Th...
      female
      38.0
      1
      0
      PC 17599
      71.2833
      C85
      C
      0
      38.0
      0
      3
    
    
      2
      3
      1
      3
      Heikkinen, Miss. Laina
      female
      26.0
      0
      0
      STON/O2. 3101282
      7.9250
      NaN
      S
      0
      26.0
      0
      1
    
    
      3
      4
      1
      1
      Futrelle, Mrs. Jacques Heath (Lily May Peel)
      female
      35.0
      1
      0
      113803
      53.1000
      C123
      S
      0
      35.0
      0
      1
    
    
      4
      5
      0
      3
      Allen, Mr. William Henry
      male
      35.0
      0
      0
      373450
      8.0500
      NaN
      S
      1
      35.0
      0
      1

Feature Engineering

Parch is the number of parents or children aboard, and SibSp is the number of siblings or spouses aboard, so we can add the two together as a FamilySize feature. Note that this counts the passenger's relatives only, not the passenger themselves.



In [155]:

    
# Relatives aboard: siblings/spouses plus parents/children.
df['FamilySize'] = df['SibSp'].add(df['Parch'])



In [156]:

    
# Interaction feature: imputed age multiplied by passenger class.
age_filled = df['AgeFill']
passenger_class = df['Pclass']
df['Age*Class'] = age_filled * passenger_class
df.describe()









    Out[156]:






  
    
      
      PassengerId
      Survived
      Pclass
      Age
      SibSp
      Parch
      Fare
      Gender
      AgeFill
      AgeIsNull
      Port
      FamilySize
      Age*Class
    
  
  
    
      count
      891.000000
      891.000000
      891.000000
      714.000000
      891.000000
      891.000000
      891.000000
      891.000000
      891.000000
      891.000000
      891.000000
      891.000000
      891.000000
    
    
      mean
      446.000000
      0.383838
      2.308642
      29.699118
      0.523008
      0.381594
      32.204208
      0.647587
      29.112424
      0.198653
      1.463524
      0.904602
      62.614860
    
    
      std
      257.353842
      0.486592
      0.836071
      14.526497
      1.102743
      0.806057
      49.693429
      0.477990
      13.304424
      0.399210
      0.791503
      1.613459
      31.362024
    
    
      min
      1.000000
      0.000000
      1.000000
      0.420000
      0.000000
      0.000000
      0.000000
      0.000000
      0.420000
      0.000000
      1.000000
      0.000000
      0.920000
    
    
      25%
      223.500000
      0.000000
      2.000000
      20.125000
      0.000000
      0.000000
      7.910400
      0.000000
      21.500000
      0.000000
      1.000000
      0.000000
      40.000000
    
    
      50%
      446.000000
      0.000000
      3.000000
      28.000000
      0.000000
      0.000000
      14.454200
      1.000000
      26.000000
      0.000000
      1.000000
      0.000000
      63.000000
    
    
      75%
      668.500000
      1.000000
      3.000000
      38.000000
      1.000000
      0.000000
      31.000000
      1.000000
      36.000000
      0.000000
      2.000000
      1.000000
      75.000000
    
    
      max
      891.000000
      1.000000
      3.000000
      80.000000
      8.000000
      6.000000
      512.329200
      1.000000
      80.000000
      1.000000
      3.000000
      10.000000
      222.000000



In [157]:

    
# Histogram of the Age*Class interaction feature in 16 bins.
age_class = df['Age*Class']
age_class.hist(alpha=0.5, bins=16)
P.show()



In [158]:

    
# List only the columns that are still stored with the object dtype.
is_object_dtype = df.dtypes.map(lambda col_dtype: col_dtype == 'object')
df.dtypes[is_object_dtype]









    Out[158]:





Name        object
Sex         object
Ticket      object
Cabin       object
Embarked    object
dtype: object



In [159]:

    
# Remove the object-typed columns and the raw Age column (AgeFill replaces it),
# leaving only numeric columns.
unused_columns = ['Name', 'Sex', 'Age', 'Ticket', 'Cabin', 'Embarked']
df = df.drop(unused_columns, axis=1)
df.info()









    



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
SibSp          891 non-null int64
Parch          891 non-null int64
Fare           891 non-null float64
Gender         891 non-null int64
AgeFill        891 non-null float64
AgeIsNull      891 non-null int64
Port           891 non-null int64
FamilySize     891 non-null int64
Age*Class      891 non-null float64
dtypes: float64(3), int64(9)
memory usage: 83.6 KB



In [161]:

    
# Summary statistics for the cleaned, all-numeric training frame.
df.describe()









    Out[161]:






  
    
      
      PassengerId
      Survived
      Pclass
      SibSp
      Parch
      Fare
      Gender
      AgeFill
      AgeIsNull
      Port
      FamilySize
      Age*Class
    
  
  
    
      count
      891.000000
      891.000000
      891.000000
      891.000000
      891.000000
      891.000000
      891.000000
      891.000000
      891.000000
      891.000000
      891.000000
      891.000000
    
    
      mean
      446.000000
      0.383838
      2.308642
      0.523008
      0.381594
      32.204208
      0.647587
      29.112424
      0.198653
      1.463524
      0.904602
      62.614860
    
    
      std
      257.353842
      0.486592
      0.836071
      1.102743
      0.806057
      49.693429
      0.477990
      13.304424
      0.399210
      0.791503
      1.613459
      31.362024
    
    
      min
      1.000000
      0.000000
      1.000000
      0.000000
      0.000000
      0.000000
      0.000000
      0.420000
      0.000000
      1.000000
      0.000000
      0.920000
    
    
      25%
      223.500000
      0.000000
      2.000000
      0.000000
      0.000000
      7.910400
      0.000000
      21.500000
      0.000000
      1.000000
      0.000000
      40.000000
    
    
      50%
      446.000000
      0.000000
      3.000000
      0.000000
      0.000000
      14.454200
      1.000000
      26.000000
      0.000000
      1.000000
      0.000000
      63.000000
    
    
      75%
      668.500000
      1.000000
      3.000000
      1.000000
      0.000000
      31.000000
      1.000000
      36.000000
      0.000000
      2.000000
      1.000000
      75.000000
    
    
      max
      891.000000
      1.000000
      3.000000
      8.000000
      6.000000
      512.329200
      1.000000
      80.000000
      1.000000
      3.000000
      10.000000
      222.000000



In [160]:

    
# Convert the cleaned frame to a NumPy array. Columns keep the frame's order,
# so column 0 is PassengerId and column 1 is Survived; features start at column 2.
train_data = df.values
train_data









    Out[160]:





array([[   1. ,    0. ,    3. , ...,    1. ,    1. ,   66. ],
       [   2. ,    1. ,    1. , ...,    3. ,    1. ,   38. ],
       [   3. ,    1. ,    3. , ...,    1. ,    0. ,   78. ],
       ..., 
       [ 889. ,    0. ,    3. , ...,    1. ,    3. ,   64.5],
       [ 890. ,    1. ,    1. , ...,    3. ,    0. ,   26. ],
       [ 891. ,    0. ,    3. , ...,    2. ,    0. ,   96. ]])



In [163]:

    
# Load the Kaggle test set; as with the training file, row 0 is the header.
df_test = pd.read_csv(
    'data/titanic-kaggle/test.csv',
    header=0,
)
df_test.describe()









    Out[163]:






  
    
      
      PassengerId
      Pclass
      Age
      SibSp
      Parch
      Fare
    
  
  
    
      count
      418.000000
      418.000000
      332.000000
      418.000000
      418.000000
      417.000000
    
    
      mean
      1100.500000
      2.265550
      30.272590
      0.447368
      0.392344
      35.627188
    
    
      std
      120.810458
      0.841838
      14.181209
      0.896760
      0.981429
      55.907576
    
    
      min
      892.000000
      1.000000
      0.170000
      0.000000
      0.000000
      0.000000
    
    
      25%
      996.250000
      1.000000
      21.000000
      0.000000
      0.000000
      7.895800
    
    
      50%
      1100.500000
      3.000000
      27.000000
      0.000000
      0.000000
      14.454200
    
    
      75%
      1204.750000
      3.000000
      39.000000
      1.000000
      0.000000
      31.500000
    
    
      max
      1309.000000
      3.000000
      76.000000
      8.000000
      9.000000
      512.329200



In [ ]:

    
# Apply the same female -> 0 / male -> 1 encoding used for the training frame.
test_sex_codes = df_test['Sex'].map({'female': 0, 'male': 1})
df_test['Gender'] = test_sex_codes.astype(int)



In [ ]:

    
# Train a random forest on the engineered training features, then predict
# survival for the Kaggle test set.

# Every column of the cleaned training frame except the passenger id and the
# label is a model feature. Selecting by name avoids the positional mix-up of
# fitting on array column 0 (PassengerId) as if it were the label while the
# real label (Survived, column 1) leaked into the features.
feature_cols = [col for col in df.columns if col not in ('PassengerId', 'Survived')]

# Create the random forest object which will include all the parameters
# for the fit; a fixed random_state makes reruns reproducible.
forest = RandomForestClassifier(n_estimators=100, random_state=0)

# Fit the training features to the Survived labels and create the decision trees
forest = forest.fit(df[feature_cols].values, df['Survived'].values)

# Build the test features with the same steps that were applied to the
# training frame, reusing statistics learned from the training data only.
test_gender = df_test['Sex'].map({'female': 0, 'male': 1}).astype(int)

# Per-row fallback age: the training median of the row's (Gender, Pclass) group.
test_age_fallback = pd.Series(
    median_ages[test_gender.values, df_test['Pclass'].values - 1],
    index=df_test.index,
)

test_features = pd.DataFrame({
    'Pclass': df_test['Pclass'],
    'SibSp': df_test['SibSp'],
    'Parch': df_test['Parch'],
    # test.csv has a missing Fare value; fall back to the training median.
    'Fare': df_test['Fare'].fillna(df['Fare'].median()),
    'Gender': test_gender,
    'AgeFill': df_test['Age'].fillna(test_age_fallback),
    'AgeIsNull': df_test['Age'].isnull().astype(int),
    'Port': df_test['Embarked'].fillna('S').map({'S': 1, 'Q': 2, 'C': 3}).astype(int),
    'FamilySize': df_test['SibSp'] + df_test['Parch'],
})
test_features['Age*Class'] = test_features['AgeFill'] * test_features['Pclass']

# Reorder to the training column order so each feature lines up with the model.
test_data = test_features[feature_cols].values

# Take the same decision trees and run it on the test data
output = forest.predict(test_data)

	PassengerId	Survived	Pclass	Name	Sex	Age	SibSp	Ticket	Fare	Cabin	Embarked
0	1	0	3	Braund, Mr. Owen Harris	male	22.0	1	A/5 21171	7.2500	NaN	S
1	2	1	1	Cumings, Mrs. John Bradley (Florence Briggs Th...	female	38.0	1	PC 17599	71.2833	C85	C
2	3	1	3	Heikkinen, Miss. Laina	female	26.0	0	STON/O2. 3101282	7.9250	NaN	S
3	4	1	1	Futrelle, Mrs. Jacques Heath (Lily May Peel)	female	35.0	1	113803	53.1000	C123	S
4	5	0	3	Allen, Mr. William Henry	male	35.0	0	373450	8.0500	NaN	S

	PassengerId	Survived	Pclass	Age	SibSp	Parch	Fare
count	891.000000	891.000000	891.000000	714.000000	891.000000	891.000000	891.000000
mean	446.000000	0.383838	2.308642	29.699118	0.523008	0.381594	32.204208
std	257.353842	0.486592	0.836071	14.526497	1.102743	0.806057	49.693429
min	1.000000	0.000000	1.000000	0.420000	0.000000	0.000000	0.000000
25%	223.500000	0.000000	2.000000	20.125000	0.000000	0.000000	7.910400
50%	446.000000	0.000000	3.000000	28.000000	0.000000	0.000000	14.454200
75%	668.500000	1.000000	3.000000	38.000000	1.000000	0.000000	31.000000
max	891.000000	1.000000	3.000000	80.000000	8.000000	6.000000	512.329200

	Gender	Pclass	Age	AgeFill
5	1	3	NaN	NaN
17	1	2	NaN	NaN
19	0	3	NaN	NaN
26	1	3	NaN	NaN
28	0	3	NaN	NaN
29	1	3	NaN	NaN
31	0	1	NaN	NaN
32	0	3	NaN	NaN
36	1	3	NaN	NaN
42	1	3	NaN	NaN

	Gender	Pclass	Age	AgeFill
5	1	3	NaN	25.0
17	1	2	NaN	30.0
19	0	3	NaN	21.5
26	1	3	NaN	25.0
28	0	3	NaN	21.5
29	1	3	NaN	25.0
31	0	1	NaN	35.0
32	0	3	NaN	21.5
36	1	3	NaN	25.0
42	1	3	NaN	25.0

	PassengerId	Pclass	Age	SibSp	Parch	Fare
count	418.000000	418.000000	332.000000	418.000000	418.000000	417.000000
mean	1100.500000	2.265550	30.272590	0.447368	0.392344	35.627188
std	120.810458	0.841838	14.181209	0.896760	0.981429	55.907576
min	892.000000	1.000000	0.170000	0.000000	0.000000	0.000000
25%	996.250000	1.000000	21.000000	0.000000	0.000000	7.895800
50%	1100.500000	3.000000	27.000000	0.000000	0.000000	14.454200
75%	1204.750000	3.000000	39.000000	1.000000	0.000000	31.500000
max	1309.000000	3.000000	76.000000	8.000000	9.000000	512.329200

	Gender	Pclass	Age	AgeFill
5	1	3	NaN	NaN
17	1	2	NaN	NaN
19	0	3	NaN	NaN
26	1	3	NaN	NaN
28	0	3	NaN	NaN
29	1	3	NaN	NaN
31	0	1	NaN	NaN
32	0	3	NaN	NaN
36	1	3	NaN	NaN
42	1	3	NaN	NaN

	Gender	Pclass	Age	AgeFill
5	1	3	NaN	NaN
17	1	2	NaN	NaN
19	0	3	NaN	NaN
26	1	3	NaN	NaN
28	0	3	NaN	NaN
29	1	3	NaN	NaN
31	0	1	NaN	NaN
32	0	3	NaN	NaN
36	1	3	NaN	NaN
42	1	3	NaN	NaN