In [141]:
import pandas as pd
import numpy as np
import pylab as P
# Import the random forest package
from sklearn.ensemble import RandomForestClassifier
# Load the training set. header=0 tells read_csv that the first row of the
# file holds the column names.
df = pd.read_csv(
    'data/titanic-kaggle/train.csv',
    header=0,
)
# Preview the first rows of the frame.
df.head()
Out[141]:
In [142]:
# Show the dtype pandas assigned to each column when the CSV was read.
df.dtypes
Out[142]:
In [143]:
# Print a summary of the frame: per-column non-null counts and dtypes.
df.info()
In [144]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric
# columns.
df.describe()
Out[144]:
In [145]:
# Encode the 'Sex' strings as integers in a new 'Gender' column
# (female -> 0, male -> 1). astype(int) raises if any value was left
# unmapped, because unmapped values become NaN.
sex_to_code = {'female': 0, 'male': 1}
gender_codes = df['Sex'].map(sex_to_code)
df['Gender'] = gender_codes.astype(int)
df.head()
Out[145]:
In [146]:
# Histogram of the recorded ages only (missing values removed first):
# 16 bins spanning 0-80, drawn half-transparent.
known_ages = df['Age'].dropna()
known_ages.hist(bins=16, range=(0, 80), alpha=0.5)
P.show()
In [147]:
# Placeholder 2 x 3 float array of zeros; it is filled in afterwards with
# one value per (Gender, Pclass) combination.
median_ages = np.zeros(shape=(2, 3), dtype=float)
median_ages
Out[147]:
In [148]:
# Median age of every (Gender, Pclass) group, computed in a single pass.
# median() skips missing ages by default, so no explicit dropna() is needed.
# unstack turns the result into a table with Gender as rows and Pclass as
# columns.
group_median_age = df.groupby(['Gender', 'Pclass'])['Age'].median().unstack('Pclass')

# median_ages[i, j] must hold the median for Gender == i and Pclass == j + 1.
# reindex lays the table out in exactly that order and leaves NaN for any
# combination that has no passengers. Writing through [:, :] fills the
# existing array in place.
n_genders, n_classes = median_ages.shape
median_ages[:, :] = group_median_age.reindex(
    index=list(range(n_genders)),
    columns=list(range(1, n_classes + 1)),
).values
median_ages
Out[148]:
In [149]:
# Start 'AgeFill' as a copy of the 'Age' column; any missing ages are still
# missing in it at this point.
df['AgeFill'] = df.loc[:, 'Age']
df.head()
Out[149]:
In [150]:
# First ten rows whose 'Age' is missing, restricted to the columns involved
# in the fill.
df.loc[df['Age'].isnull(), ['Gender', 'Pclass', 'Age', 'AgeFill']].head(10)
Out[150]:
In [151]:
# Fill the missing entries of 'AgeFill' with the median age of the row's
# (Gender, Pclass) group. median_ages[i, j] is the value used for
# Gender == i and Pclass == j + 1; ndenumerate walks every cell of the table
# so the loop bounds follow the array's shape.
age_missing = df['Age'].isnull()  # same for every group, so computed once
for (gender, class_idx), group_median in np.ndenumerate(median_ages):
    in_group = (df['Gender'] == gender) & (df['Pclass'] == class_idx + 1)
    df.loc[age_missing & in_group, 'AgeFill'] = group_median

# Rows whose 'Age' is missing: 'Age' is untouched, 'AgeFill' now carries the
# group median.
df.loc[age_missing, ['Gender', 'Pclass', 'Age', 'AgeFill']].head(10)
Out[151]:
In [152]:
# Indicator column: 1 where the original 'Age' was missing, 0 otherwise.
age_was_missing = df['Age'].isnull()
df['AgeIsNull'] = age_was_missing.astype(int)
df.describe()
Out[152]:
In [153]:
# Up to ten rows that have no 'Embarked' value, with a few columns for
# context.
df.loc[df['Embarked'].isnull(), ['Fare', 'Pclass', 'AgeFill', 'Embarked']].head(10)
Out[153]:
In [154]:
# 'Port' is 'Embarked' with missing values replaced by 'S'. A scalar fill
# works for any number of missing rows; assigning the two-element list
# ['S', 'S'] only works when exactly two rows are missing and raises a
# ValueError otherwise.
df['Port'] = df['Embarked'].fillna('S')

# Encode the port letters as integers. astype(int) raises if a letter
# outside the mapping is present, because unmapped values become NaN.
df['Port'] = df['Port'].map({'S': 1, 'Q': 2, 'C': 3}).astype(int)
df.head()
Out[154]:
In [155]:
# 'FamilySize' is the element-wise sum of the 'SibSp' and 'Parch' columns.
df['FamilySize'] = df['SibSp'].add(df['Parch'])
In [156]:
# Interaction feature: the filled age multiplied by the passenger class.
df['Age*Class'] = df['AgeFill'].mul(df['Pclass'])
df.describe()
Out[156]:
In [157]:
# Histogram of the 'Age*Class' feature: 16 bins, half-transparent bars.
age_class = df['Age*Class']
age_class.hist(bins=16, alpha=0.5)
P.show()
In [158]:
# List the columns whose dtype is 'object'.
is_object_column = [dtype == 'object' for dtype in df.dtypes]
df.dtypes[is_object_column]
Out[158]:
In [159]:
# Remove the columns that engineered ones now stand in for
# (Sex -> Gender, Age -> AgeFill, Embarked -> Port) together with
# Name, Ticket and Cabin.
# Note: drop raises KeyError if this cell is run a second time, because the
# columns are already gone.
unused_columns = ['Name', 'Sex', 'Age', 'Ticket', 'Cabin', 'Embarked']
df = df.drop(unused_columns, axis=1)
df.info()
In [161]:
# Summary statistics of the columns that remain after the drop.
df.describe()
Out[161]:
In [160]:
# Convert the cleaned frame to a plain NumPy array. Column order follows
# df.columns, so positional slices of train_data depend on that order.
train_data = df.values
train_data
Out[160]:
In [163]:
# Load the test set; as with the training file, the first row is the header.
df_test = pd.read_csv(
    'data/titanic-kaggle/test.csv',
    header=0,
)
df_test.describe()
Out[163]:
In [ ]:
# Same integer encoding of 'Sex' as used for the training frame
# (female -> 0, male -> 1).
test_gender_codes = df_test['Sex'].map({'female': 0, 'male': 1})
df_test['Gender'] = test_gender_codes.astype(int)
In [ ]:
def _engineer_test_features(raw_test, train_frame, age_medians):
    """Return a copy of raw_test carrying the engineered training columns.

    raw_test     -- test frame as read from the CSV.
    train_frame  -- cleaned training frame; supplies per-class median fares.
    age_medians  -- array where age_medians[i, j] is the median age for
                    Gender == i and Pclass == j + 1.

    Fill values come from the training data so the test rows are transformed
    the same way as the rows the model was fitted on.
    """
    features = raw_test.copy()
    features['Gender'] = features['Sex'].map({'female': 0, 'male': 1}).astype(int)

    # Missing ages get the training median of the row's (Gender, Pclass) group.
    age_missing = features['Age'].isnull()
    features['AgeIsNull'] = age_missing.astype(int)
    features['AgeFill'] = features['Age']
    for (gender, class_idx), group_median in np.ndenumerate(age_medians):
        in_group = (features['Gender'] == gender) & (features['Pclass'] == class_idx + 1)
        features.loc[age_missing & in_group, 'AgeFill'] = group_median

    features['Port'] = (
        features['Embarked'].fillna('S').map({'S': 1, 'Q': 2, 'C': 3}).astype(int)
    )
    features['FamilySize'] = features['SibSp'] + features['Parch']
    features['Age*Class'] = features['AgeFill'] * features['Pclass']

    # NOTE(review): assumes the test file may contain missing 'Fare' values;
    # they are filled with the training median fare of the same Pclass.
    fare_by_class = train_frame.groupby('Pclass')['Fare'].median()
    features['Fare'] = features['Fare'].fillna(features['Pclass'].map(fare_by_class))
    return features


# Pick the label and the features by column name instead of by position.
# NOTE(review): 'PassengerId' is never dropped from df; in the Kaggle file
# layout it is presumably the first column, in which case the positional
# split train_data[:, 0] / train_data[:, 1:] would use the passenger id as
# the label and 'Survived' as a feature -- confirm with df.columns.
label_column = 'Survived'
feature_columns = [col for col in df.columns if col not in ('PassengerId', label_column)]

# train_data was built as df.values, so positions in df.columns index its
# columns; fail loudly if the two have drifted apart.
assert train_data.shape[1] == len(df.columns), 'train_data is out of sync with df'
feature_positions = [df.columns.get_loc(col) for col in feature_columns]
X_train = train_data[:, feature_positions]
y_train = train_data[:, df.columns.get_loc(label_column)].astype(int)

# Build the test matrix with the same columns, in the same order, as X_train.
test_data = _engineer_test_features(df_test, df, median_ages)[feature_columns].values

# Create the random forest object which will include all the parameters
# for the fit. random_state is fixed so that re-running the notebook
# reproduces the same trees and predictions.
forest = RandomForestClassifier(n_estimators=100, random_state=0)
# Fit the training data to the Survived labels and create the decision trees
forest = forest.fit(X_train, y_train)
# Take the same decision trees and run them on the test data
output = forest.predict(test_data)