See the notes at the end for a description of the features.
In [1]:
import pandas as pd
import numpy as np
import csv as csv
from sklearn.ensemble import RandomForestClassifier
# Read the training set into a DataFrame.
# header=0 tells pandas that row 0 of the file holds the column names.
training_csv_path = 'data/train.csv'
train_df = pd.read_csv(training_csv_path, header=0)
In [2]:
# Encode the text column "Sex" as a new integer column "Gender"
# (female -> 0, male -> 1). The original "Sex" column is left in place here.
sex_to_code = {'female': 0, 'male': 1}
train_df['Gender'] = train_df['Sex'].map(sex_to_code).astype(int)
In [3]:
# --- Embarked ---------------------------------------------------------------
# Port of embarkation is one of 'C', 'Q', 'S'.
# Note this is not ideal: once the categories are translated to numbers they
# look ordered, although port "2" is not two times greater than port "1".

# Fill missing ports with the most common one (the mode of the distribution).
# The write goes through a single .loc indexer so that it lands in train_df
# itself; the chained form `train_df.Embarked[mask] = ...` may assign to a
# temporary copy and leave the NaNs in place.
# mode() can return several values on a tie, so take the first one.
embarked_missing = train_df['Embarked'].isnull()
if embarked_missing.any():
    train_df.loc[embarked_missing, 'Embarked'] = train_df['Embarked'].dropna().mode().iloc[0]

# Map every port to an integer code (similar to Gender).
# Ports_dict stays a module-level name: the test-set preprocessing further
# down reuses it so both data sets share the same encoding.
Ports = list(enumerate(np.unique(train_df['Embarked'])))  # [(index, port), ...]
Ports_dict = {name: i for i, name in Ports}  # port -> index
train_df['Embarked'] = train_df['Embarked'].map(Ports_dict).astype(int)
# Or: train_df['Embarked'] = train_df['Embarked'].map({'C': 0, 'Q': 1, 'S': 2}).astype(int)

# --- Age --------------------------------------------------------------------
# Replace every missing age with the median of the known ages.
median_age = train_df['Age'].dropna().median()
train_df['Age'] = train_df['Age'].fillna(median_age)

# --- Unused columns ---------------------------------------------------------
# Drop the columns that are not used as features: Name, Ticket, Cabin,
# PassengerId, and Sex (already encoded in Gender).
# In DataFrame.drop, axis=1 means the labels refer to columns (axis=0: rows).
train_df = train_df.drop(['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId'], axis=1)
In [4]:
# Load the test file into a DataFrame (column names are in the first row).
test_df = pd.read_csv('data/test.csv', header=0)

# All string columns used as features must become integer codes.

# --- Gender: female = 0, male = 1 -------------------------------------------
test_df['Gender'] = test_df['Sex'].map({'female': 0, 'male': 1}).astype(int)

# --- Embarked: 'C', 'Q', 'S' ------------------------------------------------
# Fill missing ports with the most common port of the test set.
# A single .loc indexer guarantees the write reaches test_df itself; the
# chained form `test_df.Embarked[mask] = ...` may assign to a temporary copy.
# mode() can return several values on a tie, so take the first one.
embarked_missing = test_df['Embarked'].isnull()
if embarked_missing.any():
    test_df.loc[embarked_missing, 'Embarked'] = test_df['Embarked'].dropna().mode().iloc[0]

# Encode the ports with the mapping built from the training data (Ports_dict).
# The explicit lookup raises KeyError naming the port if the test set contains
# a port that never appeared in training.
test_df['Embarked'] = test_df['Embarked'].map(lambda port: Ports_dict[port]).astype(int)

# --- Age --------------------------------------------------------------------
# Replace every missing age with the median of the known test-set ages.
median_age = test_df['Age'].dropna().median()
test_df['Age'] = test_df['Age'].fillna(median_age)

# --- Fare -------------------------------------------------------------------
# Replace a missing fare with the median fare of the passenger's class.
# Done for the test data only; the training preprocessing does no fare
# imputation (the original author notes the training fares are complete).
fare_missing = test_df['Fare'].isnull()
if fare_missing.any():
    # Median fare per Pclass; groupby().median() ignores the missing fares.
    class_median_fare = test_df.groupby('Pclass')['Fare'].median()
    test_df.loc[fare_missing, 'Fare'] = test_df.loc[fare_missing, 'Pclass'].map(class_median_fare)
In [5]:
# PassengerId is not used as a feature, but the IDs are kept aside because
# each prediction is later written to the output file next to its ID.
ids = test_df['PassengerId'].values

# Drop the non-feature columns: Name, Ticket, Cabin, PassengerId, and Sex
# (already copied into Gender).
columns_to_drop = ['Name', 'Sex', 'Ticket', 'Cabin', 'PassengerId']
test_df = test_df.drop(columns_to_drop, axis=1)
In [6]:
import sys

# Convert the DataFrames to plain numpy arrays.
train_data = train_df.values
test_data = test_df.values

# print(...) with a single string argument behaves the same on Python 2 and 3.
print('Training...')

# Random forest with 1000 trees; n_jobs=-1 asks scikit-learn to use all
# available processors. The remaining parameters are spelled out explicitly.
forest = RandomForestClassifier(n_estimators=1000, max_depth=None, min_samples_split=2,
                                min_samples_leaf=1, bootstrap=True, oob_score=False, n_jobs=-1)

# Fit the training data (X, Y): column 0 is used as the target, every other
# column as a feature.
# NOTE(review): assumes the label is the first remaining column of train.csv
# and that test_df has the same feature columns in the same order - confirm.
forest = forest.fit(train_data[:, 1:], train_data[:, 0])

print('Predicting...')
# Predict outcomes for the test data and store them as integers.
output = forest.predict(test_data).astype(int)

# Write the predictions, one "PassengerId,Survived" row per passenger.
# The csv module needs a binary handle on Python 2 and a text handle opened
# with newline="" on Python 3; both produce the same file contents.
if sys.version_info[0] < 3:
    predictions_file = open("myfirstforest.csv", "wb")
else:
    predictions_file = open("myfirstforest.csv", "w", newline="")
# The with-block closes the file even if writing a row raises.
with predictions_file:
    open_file_object = csv.writer(predictions_file)
    open_file_object.writerow(["PassengerId", "Survived"])
    open_file_object.writerows(zip(ids, output))

print('Done.')
survival: Survival (0 = No; 1 = Yes)
pclass: Passenger Class (1 = 1st; 2 = 2nd; 3 = 3rd)
name: Name
sex: Sex
age: Age
sibsp: Number of Siblings/Spouses Aboard
parch: Number of Parents/Children Aboard
ticket: Ticket Number
fare: Passenger Fare
cabin: Cabin
embarked: Port of Embarkation (C = Cherbourg; Q = Queenstown; S = Southampton)
Note: pclass is a proxy for socio-economic status: 1st ~ Upper; 2nd ~ Middle; 3rd ~ Lower
If the Age is Estimated, it is in the form xx.5