notebook.community

Edit and run



In [7]:

    
%matplotlib inline
import pandas as pd
import matplotlib as plot
import numpy as np



In [12]:

    
def prepareData(data):
    """Turn a raw passenger frame into numeric model features, in place.

    The frame is modified directly and nothing is returned. The function is
    not idempotent: it drops the 'Sex' column it reads first, so calling it a
    second time on the same frame raises KeyError.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns 'Sex', 'Age', 'Pclass', 'SibSp', 'Parch',
        'Name', 'Ticket', 'Cabin' and 'Embarked'. 'Sex' must hold only
        'female' / 'male'; any other or missing value makes the integer
        cast below raise.

    Columns added (in this order)
    -----------------------------
    Gender      0 for 'female', 1 for 'male'.
    AgeFill     'Age', with missing values replaced by the median age of the
                row's (Gender, Pclass) group; if that group has no known age
                at all, the median of every known age is used instead.
    AgeIsNull   1 where 'Age' was missing, else 0.
    FamilySize  SibSp + Parch.
    Age*Class   AgeFill * Pclass.

    Columns removed
    ---------------
    'Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age'.
    """
    # Integer-encoded gender column.
    data['Gender'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Median known age of each (gender, class) group, broadcast back onto
    # every row of that group. NaN ages are skipped by the median; a group
    # with no known ages yields NaN here.
    group_median_age = data.groupby(['Gender', 'Pclass'])['Age'].transform('median')

    # Keep known ages; fill the gaps from the group median first, and fall
    # back to the overall median so an all-missing group does not leave NaN
    # behind for the model to choke on.
    data['AgeFill'] = (
        data['Age']
        .fillna(group_median_age)
        .fillna(data['Age'].median())
    )

    # Remember which ages were imputed; the flag is itself a feature.
    data['AgeIsNull'] = data['Age'].isnull().astype(int)
    data['FamilySize'] = data['SibSp'] + data['Parch']
    data['Age*Class'] = data['AgeFill'] * data['Pclass']

    # In-place drop on purpose: callers rely on their own frame being
    # updated, since this function returns nothing.
    data.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age'], axis=1, inplace=True)



In [13]:

    
# Read both CSVs (first row is the header), then run each frame through
# prepareData, which rewrites the frame in place and returns nothing.
train_df = pd.read_csv('data/train.csv', header=0)
test_df = pd.read_csv('data/test.csv', header=0)

prepareData(train_df)
prepareData(test_df)



In [14]:

    
# Histogram of the imputed ages in the training frame; `histogram` is the
# matplotlib Axes returned by pandas, which we then title.
histogram = train_df.AgeFill.hist()
histogram.set_title(label='Age distribution')









    Out[14]:





<matplotlib.text.Text at 0x7fe521508390>



In [15]:

    
# Preview the first ten prepared training rows.
train_df.iloc[:10]









    Out[15]:






  
    
      
      PassengerId
      Survived
      Pclass
      SibSp
      Parch
      Fare
      Gender
      AgeFill
      AgeIsNull
      FamilySize
      Age*Class
    
  
  
    
      0
      1
      0
      3
      1
      0
      7.2500
      1
      22
      0
      1
      66
    
    
      1
      2
      1
      1
      1
      0
      71.2833
      0
      38
      0
      1
      38
    
    
      2
      3
      1
      3
      0
      0
      7.9250
      0
      26
      0
      0
      78
    
    
      3
      4
      1
      1
      1
      0
      53.1000
      0
      35
      0
      1
      35
    
    
      4
      5
      0
      3
      0
      0
      8.0500
      1
      35
      0
      0
      105
    
    
      5
      6
      0
      3
      0
      0
      8.4583
      1
      25
      1
      0
      75
    
    
      6
      7
      0
      1
      0
      0
      51.8625
      1
      54
      0
      0
      54
    
    
      7
      8
      0
      3
      3
      1
      21.0750
      1
      2
      0
      4
      6
    
    
      8
      9
      1
      3
      0
      2
      11.1333
      0
      27
      0
      2
      81
    
    
      9
      10
      1
      2
      1
      0
      30.0708
      0
      14
      0
      1
      28



In [41]:

    
import warnings

# Import the random forest package
from sklearn.ensemble import RandomForestClassifier

# disable nonfatal warnings
warnings.filterwarnings('ignore')

# Training matrix without the id column. After that drop, 'Survived' is
# column 0 and every remaining column is a feature.
train_data = train_df.drop('PassengerId', axis=1).values

# Discard incomplete test rows ONCE and take both the ids and the feature
# matrix from that same filtered frame. Previously the features were
# filtered with dropna() while the ids came from the unfiltered frame, so
# zip() silently truncated and every prediction after the first dropped row
# was attached to the wrong PassengerId.
test_complete = test_df.dropna()
test_data = test_complete.drop('PassengerId', axis=1).values
ids = test_complete['PassengerId'].values

# Create the random forest object which will include all the parameters
# for the fit. A fixed random_state makes the fitted trees, and therefore
# the predictions, reproducible across kernel restarts.
forest = RandomForestClassifier(n_estimators=100, random_state=0)

# Fit the training data to the Survived labels and create the decision trees
forest = forest.fit(train_data[:, 1:], train_data[:, 0])

# Take the same decision trees and run it on the test data
output = forest.predict(test_data).astype(int)

# One prediction per retained passenger, or the pairing below is wrong.
assert len(ids) == len(output), "ids and predictions are out of step"

results = pd.DataFrame({'PassengerId': ids, 'WillSurvive': output},
                       columns=['PassengerId', 'WillSurvive'])
results.hist()









    Out[41]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fe518ed95c0>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe518cf40b8>]], dtype=object)



In [63]:

    
# Select the predicted survivors BEFORE merging, so the boolean mask is applied
# to the frame it was built from. The previous form masked the merged frame
# with a Series taken from `results`, which only works while both frames
# happen to share the same index.
predicted_survivors = results[results['WillSurvive'] == 1].merge(test_df, on="PassengerId")

# Feature distributions among passengers the model expects to survive.
predicted_survivors[['AgeFill', 'Pclass', 'FamilySize']].hist()









    Out[63]:





array([[<matplotlib.axes._subplots.AxesSubplot object at 0x7fe51874d208>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe5184dbd30>],
       [<matplotlib.axes._subplots.AxesSubplot object at 0x7fe5185289e8>,
        <matplotlib.axes._subplots.AxesSubplot object at 0x7fe51845a400>]], dtype=object)



In [ ]:

	PassengerId	Survived	Pclass	SibSp	Parch	Fare	Gender	AgeFill	AgeIsNull	FamilySize	Age*Class
0	1	0	3	1	0	7.2500	1	22	0	1	66
1	2	1	1	1	0	71.2833	0	38	0	1	38
2	3	1	3	0	0	7.9250	0	26	0	0	78
3	4	1	1	1	0	53.1000	0	35	0	1	35
4	5	0	3	0	0	8.0500	1	35	0	0	105
5	6	0	3	0	0	8.4583	1	25	1	0	75
6	7	0	1	0	0	51.8625	1	54	0	0	54
7	8	0	3	3	1	21.0750	1	2	0	4	6
8	9	1	3	0	2	11.1333	0	27	0	2	81
9	10	1	2	1	0	30.0708	0	14	0	1	28