In [7]:
%matplotlib inline
import pandas as pd
import matplotlib as plot
import numpy as np
In [12]:
def prepareData(data):
    """Add engineered feature columns to a passenger DataFrame, in place.

    The frame is modified directly and nothing is returned, so callers keep
    using the object they passed in.

    Columns read:    Sex, Age, Pclass, SibSp, Parch
    Columns added:   Gender     - 0 for 'female', 1 for 'male'
                     AgeFill    - Age, with missing values replaced by the
                                  median Age of the row's (Gender, Pclass) group
                     AgeIsNull  - 1 where Age was missing, else 0
                     FamilySize - SibSp + Parch
                     Age*Class  - AgeFill * Pclass
    Columns removed: Name, Sex, Ticket, Cabin, Embarked, Age

    Raises:
        KeyError: if any of the columns read or removed is absent (this also
            means the function cannot be applied twice to the same frame).
        ValueError: if 'Sex' holds anything other than 'female' / 'male',
            because the unmapped NaN cannot be cast to int.
    """
    # create integer-coded gender column
    data['Gender'] = data['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Median age of each (gender, class) group, broadcast back onto every row.
    # Using groupby instead of a fixed 2x3 table means any class value present
    # in the data is handled, not only 1..3. A group with no known ages yields
    # NaN, so its rows stay unfilled.
    group_median_age = data.groupby(['Gender', 'Pclass'])['Age'].transform('median')

    # lets add some new useful columns
    data['AgeFill'] = data['Age'].fillna(group_median_age)
    data['AgeIsNull'] = data['Age'].isnull().astype(int)
    data['FamilySize'] = data['SibSp'] + data['Parch']
    data['Age*Class'] = data['AgeFill'] * data['Pclass']

    # inplace=True is intentional: callers rely on their own frame being mutated.
    data.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked', 'Age'], axis=1, inplace=True)
In [13]:
# Read the raw training and test tables; the first CSV row holds the column names.
train_df = pd.read_csv('data/train.csv', header=0)
test_df = pd.read_csv('data/test.csv', header=0)

# Feature-engineer both frames in place (prepareData mutates its argument).
prepareData(train_df)
prepareData(test_df)
In [14]:
# Distribution of passenger ages in the training set after imputation.
histogram = train_df['AgeFill'].hist()
# Label both axes so the figure can be read on its own.
histogram.set_xlabel('Age')
histogram.set_ylabel('Passengers')
histogram.set_title('Age distribution')
Out[14]:
In [15]:
# Preview the first ten rows of the prepared training frame.
train_df.head(10)
Out[15]:
In [41]:
import warnings
# Import the random forest package
from sklearn.ensemble import RandomForestClassifier

# disable nonfatal warnings
warnings.filterwarnings('ignore')

# Training matrix without the id column. The slices below treat column 0 as
# the label and the remaining columns as features.
# NOTE(review): this assumes the label ('Survived') is the first column left
# after dropping PassengerId - confirm against data/train.csv.
train_data = train_df.drop('PassengerId', axis=1).values

# Rows with any missing feature cannot be scored, so they are dropped. The ids
# are taken from exactly the rows that survive the filter; pulling them from
# the unfiltered frame would shift every prediction after the first dropped
# row onto the wrong passenger.
test_features = test_df.drop('PassengerId', axis=1).dropna()
test_data = test_features.values
ids = test_df.loc[test_features.index, 'PassengerId'].values

# Create the random forest object which will include all the parameters
# for the fit. random_state is fixed so a re-run reproduces the same forest.
forest = RandomForestClassifier(n_estimators=100, random_state=0)

# Fit the training data to the labels and create the decision trees
forest = forest.fit(train_data[:, 1:], train_data[:, 0])

# Take the same decision trees and run it on the test data
output = forest.predict(test_data).astype(int)

# One row per scored passenger; the dict constructor raises if the two arrays
# ever differ in length instead of silently truncating.
results = pd.DataFrame({'PassengerId': ids, 'WillSurvive': output})
results.hist()
Out[41]:
In [63]:
# Histograms of age, class and family size for passengers predicted to survive.
# The row filter is computed from the merged frame itself (callable .loc), so
# it no longer depends on `results` and the merge output sharing an index.
(
    results
    .merge(test_df, on="PassengerId")
    .loc[lambda merged: merged['WillSurvive'] == 1, ['AgeFill', 'Pclass', 'FamilySize']]
    .hist()
)
Out[63]:
In [ ]: