In [235]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize
import random
In [136]:
test=pd.read_csv("test.csv")
test.head()
Out[136]:
In [137]:
mData=pd.read_csv("train.csv")
mData.head()
Out[137]:
In [138]:
mData = mData.drop(["PassengerId","Name","Ticket"],axis=1)
test = test.drop(["Name","Ticket"],axis=1)
In [139]:
mData.head()
Out[139]:
In [140]:
# Family
# Instead of keeping the two columns Parch & SibSp,
# we can use a single column that records whether the passenger had any family member aboard,
# i.e. whether having any family member (parent, sibling, spouse, child, etc.) affects the chance of survival.
mData['Family'] = mData['Parch'] + mData['SibSp']
mData.loc[mData['Family'] > 0, 'Family'] = 1   # use .loc to avoid pandas chained-assignment warnings
mData.loc[mData['Family'] == 0, 'Family'] = 0
test['Family'] = test['Parch'] + test['SibSp']
test.loc[test['Family'] > 0, 'Family'] = 1
test.loc[test['Family'] == 0, 'Family'] = 0
# drop Parch & SibSp
mData = mData.drop(['SibSp','Parch'], axis=1)
test = test.drop(['SibSp','Parch'], axis=1)
In [141]:
mData.head()
Out[141]:
In [142]:
# Sex
# As we can see, children (age < ~16) aboard seem to have a higher chance of survival.
# So we classify passengers as male, female, or child.
def get_person(passenger):
    age, sex = passenger
    return 'child' if age < 16 else sex
mData['Person'] = mData[['Age','Sex']].apply(get_person,axis=1)
test['Person'] = test[['Age','Sex']].apply(get_person,axis=1)
# No need to use Sex column since we created Person column
mData.drop(['Sex'],axis=1,inplace=True)
test.drop(['Sex'],axis=1,inplace=True)
In [143]:
mData[:10]
Out[143]:
As with all the important questions in life, this is best deferred to the Wikipedia page. A random forest is an ensemble of decision trees that together output a prediction value, in this case survival. Each decision tree is constructed using a random subset of the training data. After you have trained your forest, you can pass each test row through it to output a prediction. Simple! Well, not quite! This particular Python implementation requires floats for all input variables, so every string needs to be converted to a number and any missing data needs to be filled in.
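To make the "ensemble of trees built on random subsets" idea concrete, here is a minimal hand-rolled sketch on made-up data. It is purely illustrative: RandomForestClassifier, used further down, automates the bootstrap sampling and voting shown here (and adds per-split feature sampling).
In [ ]:
# Illustrative only: a tiny hand-built "forest" on toy data, not part of the Titanic pipeline.
from sklearn.tree import DecisionTreeClassifier
rng = np.random.RandomState(0)
X_toy = rng.rand(200, 3)                              # toy feature matrix (floats only)
y_toy = (X_toy[:, 0] + X_toy[:, 1] > 1).astype(int)   # toy labels
trees = []
for _ in range(10):
    idx = rng.randint(0, len(X_toy), len(X_toy))      # bootstrap sample (with replacement)
    trees.append(DecisionTreeClassifier(max_features="sqrt", random_state=0)
                 .fit(X_toy[idx], y_toy[idx]))
# Each tree votes; the majority vote is the forest's prediction.
votes = np.array([t.predict(X_toy) for t in trees])
forest_pred = (votes.mean(axis=0) > 0.5).astype(int)
print("toy training accuracy:", (forest_pred == y_toy).mean())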
In [144]:
import random
def PersonFunc(x):
    # map the Person categories to integers
    if x == "male":
        return 2
    elif x == "female":
        return 1
    elif x == "child":
        return 0
    else:
        return x
def EmbarkedFunc(x):
    # map the port of embarkation to an integer; fill missing values with a random port
    if x == "S":
        return 1
    elif x == "C":
        return 2
    elif x == "Q":
        return 3
    elif np.isnan(x):
        return random.choice([1, 2, 3])
    else:
        return x
mData["Person"] = mData["Person"].apply(PersonFunc)
test["Person"] = test["Person"].apply(PersonFunc)
mData["Embarked"] = mData["Embarked"].apply(EmbarkedFunc)
test["Embarked"] = test["Embarked"].apply(EmbarkedFunc)
In [145]:
mData[:8]
Out[145]:
In [146]:
print "Nan Entries in Cabin :",mData["Cabin"].isnull().sum()
def CabinFun(x):
if type(x)==str:
return ord(x[0])-ord('A')
elif np.isnan(x):
return 0
else:
return x
mData["Cabin"] = mData["Cabin"].apply(CabinFun)
test["Cabin"] = test["Cabin"].apply(CabinFun)
In [147]:
mData.head()
Out[147]:
In [148]:
avgAge=mData["Age"].mean()
stander_dev=avgAge=mData["Age"].std()
avgAge_test = test["Age"].mean()
satnder_dev_test = test["Age"].std()
#select Randint between avg-std,avg+std
train_replacement = np.random.randint(avgAge-stander_dev,avgAge+stander_dev, size=mData["Age"].isnull().sum())
test_replacement = np.random.randint(avgAge_test-satnder_dev_test,avgAge_test+satnder_dev_test,size=test["Age"].isnull().sum())
In [149]:
train_replacement
Out[149]:
In [150]:
mData["Age"][np.isnan(mData["Age"])] = train_replacement
test["Age"][np.isnan(test["Age"])] = test_replacement
In [206]:
mData.head()
Out[206]:
In [254]:
train_data = mData.values[:, 1:]   # all rows, columns 1 onwards: drop the label (Survived) column
train_label = mData.values[:, 0]   # all rows, column 0: the Survived label
# The Kaggle test set has a single missing Fare value; fill it so normalize() below does not fail on NaN.
test["Fare"] = test["Fare"].fillna(test["Fare"].median())
test_data = test.values            # PassengerId stays in this array and is stripped off before prediction
In [255]:
train_data = normalize(train_data, norm='l2', axis=1, copy=True)
# Exclude the PassengerId column before normalizing the test rows; otherwise its large values
# dominate the row norms and the test features end up scaled very differently from the training features.
n_test_data = normalize(test_data[:, 1:], norm='l2', axis=1, copy=True)
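For intuition, norm='l2' with axis=1 rescales each row (each passenger) so its feature vector has unit Euclidean length. A quick self-contained check with made-up numbers:
In [ ]:
# Row-wise L2 normalization: the row [3, 4] has length 5, so it becomes [0.6, 0.8].
print(normalize(np.array([[3.0, 4.0]]), norm='l2', axis=1))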
In [256]:
clsf = RandomForestClassifier(n_estimators=100)
In [257]:
clsf=clsf.fit(train_data,train_label)
In [258]:
pred_train=clsf.predict(train_data)
print "Accuracy Score for trainig Data :",accuracy_score(pred_train,train_label)*100,"%"
In [259]:
pred_test = clsf.predict(n_test_data)   # n_test_data already has PassengerId excluded (see the normalization cell above)
In [260]:
#Convert to dataframe
result=pd.DataFrame({"PassengerId" :test_data[0::,0],"Survived" : pred_test})
result["Survived"] = result["Survived"].astype(int)
result["PassengerId"] = result["PassengerId"].astype(int)
In [263]:
result
Out[263]:
In [264]:
result.to_csv("Result.csv")
print "Exported"
In [ ]: