In [235]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import normalize
import random
In [136]:
test=pd.read_csv("test.csv")
test.head()
Out[136]:
In [137]:
mData=pd.read_csv("train.csv")
mData.head()
Out[137]:
In [138]:
mData = mData.drop(["PassengerId","Name","Ticket"],axis=1)
test = test.drop(["Name","Ticket"],axis=1)
In [139]:
mData.head()
Out[139]:
In [140]:
# Family
# Instead of keeping the two columns Parch & SibSp,
# we can use a single column that records whether the passenger had any family member aboard,
# i.e. whether having any family member (parent, sibling, spouse, child, etc.) affects the chance of survival.
mData['Family'] = mData['Parch'] + mData['SibSp']
mData.loc[mData['Family'] > 0, 'Family'] = 1   # use .loc to avoid pandas chained-assignment warnings
mData.loc[mData['Family'] == 0, 'Family'] = 0
test['Family'] = test['Parch'] + test['SibSp']
test.loc[test['Family'] > 0, 'Family'] = 1
test.loc[test['Family'] == 0, 'Family'] = 0
# drop Parch & SibSp
mData = mData.drop(['SibSp','Parch'], axis=1)
test = test.drop(['SibSp','Parch'], axis=1)
In [141]:
mData.head()
Out[141]:
In [142]:
# Sex
# As we can see, children (age < ~16) aboard seem to have a higher chance of survival.
# So we classify passengers as male, female, or child.
def get_person(passenger):
    age, sex = passenger
    return 'child' if age < 16 else sex
mData['Person'] = mData[['Age','Sex']].apply(get_person,axis=1)
test['Person'] = test[['Age','Sex']].apply(get_person,axis=1)
# No need to use Sex column since we created Person column
mData.drop(['Sex'],axis=1,inplace=True)
test.drop(['Sex'],axis=1,inplace=True)
In [143]:
mData[:10]
Out[143]:
As with all the important questions in life, this is best deferred to the Wikipedia page. A random forest is an ensemble of decision trees that together output a prediction value, in this case survival. Each decision tree is constructed using a random subset of the training data. After you have trained your forest, you can pass each test row through it to output a prediction. Simple! Well, not quite! This particular Python implementation requires floats for all input variables, so every string needs to be converted to a number and any missing data needs to be filled in.
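To make the "ensemble of trees built on random subsets" idea concrete, here is a minimal hand-rolled sketch on made-up data. It is purely illustrative: RandomForestClassifier, used further down, automates the bootstrap sampling and voting shown here (and adds per-split feature sampling).
In [ ]:
# Illustrative only: a tiny hand-built "forest" on toy data, not part of the Titanic pipeline.
from sklearn.tree import DecisionTreeClassifier
rng = np.random.RandomState(0)
X_toy = rng.rand(200, 3)                              # toy feature matrix (floats only)
y_toy = (X_toy[:, 0] + X_toy[:, 1] > 1).astype(int)   # toy labels
trees = []
for _ in range(10):
    idx = rng.randint(0, len(X_toy), len(X_toy))      # bootstrap sample (with replacement)
    trees.append(DecisionTreeClassifier(max_features="sqrt", random_state=0)
                 .fit(X_toy[idx], y_toy[idx]))
# Each tree votes; the majority vote is the forest's prediction.
votes = np.array([t.predict(X_toy) for t in trees])
forest_pred = (votes.mean(axis=0) > 0.5).astype(int)
print("toy training accuracy:", (forest_pred == y_toy).mean())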
In [144]:
import random
def PersonFunc(x):
    # map the Person categories to integers
    if x == "male":
        return 2
    elif x == "female":
        return 1
    elif x == "child":
        return 0
    else:
        return x
def EmbarkedFunc(x):
    # map the port of embarkation to an integer; fill missing values with a random port
    if x == "S":
        return 1
    elif x == "C":
        return 2
    elif x == "Q":
        return 3
    elif np.isnan(x):
        return random.choice([1, 2, 3])
    else:
        return x
mData["Person"] = mData["Person"].apply(PersonFunc)
test["Person"] = test["Person"].apply(PersonFunc)
mData["Embarked"] = mData["Embarked"].apply(EmbarkedFunc)
test["Embarked"] = test["Embarked"].apply(EmbarkedFunc)
In [145]:
mData[:8]
Out[145]:
In [146]:
print "Nan Entries in Cabin :",mData["Cabin"].isnull().sum()
def CabinFun(x):
if type(x)==str:
return ord(x[0])-ord('A')
elif np.isnan(x):
return 0
else:
return x
mData["Cabin"] = mData["Cabin"].apply(CabinFun)
test["Cabin"] = test["Cabin"].apply(CabinFun)
In [147]:
mData.head()
Out[147]:
In [148]:
avgAge=mData["Age"].mean()
stander_dev=avgAge=mData["Age"].std()
avgAge_test = test["Age"].mean()
satnder_dev_test = test["Age"].std()
#select Randint between avg-std,avg+std
train_replacement = np.random.randint(avgAge-stander_dev,avgAge+stander_dev, size=mData["Age"].isnull().sum())
test_replacement = np.random.randint(avgAge_test-satnder_dev_test,avgAge_test+satnder_dev_test,size=test["Age"].isnull().sum())
In [149]:
train_replacement
Out[149]:
In [150]:
mData["Age"][np.isnan(mData["Age"])] = train_replacement
test["Age"][np.isnan(test["Age"])] = test_replacement
In [206]:
mData.head()
Out[206]:
In [254]:
train_data = mData.values[:, 1:]   # all rows, columns 1 onwards: drop the label (Survived) column
train_label = mData.values[:, 0]   # all rows, column 0: the Survived label
# The Kaggle test set has a single missing Fare value; fill it so normalize() below does not fail on NaN.
test["Fare"] = test["Fare"].fillna(test["Fare"].median())
test_data = test.values            # PassengerId stays in this array and is stripped off before prediction
In [255]:
train_data = normalize(train_data, norm='l2', axis=1, copy=True)
# Exclude the PassengerId column before normalizing the test rows; otherwise its large values
# dominate the row norms and the test features end up scaled very differently from the training features.
n_test_data = normalize(test_data[:, 1:], norm='l2', axis=1, copy=True)
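For intuition, norm='l2' with axis=1 rescales each row (each passenger) so its feature vector has unit Euclidean length. A quick self-contained check with made-up numbers:
In [ ]:
# Row-wise L2 normalization: the row [3, 4] has length 5, so it becomes [0.6, 0.8].
print(normalize(np.array([[3.0, 4.0]]), norm='l2', axis=1))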
In [256]:
clsf = RandomForestClassifier(n_estimators=100)
In [257]:
clsf=clsf.fit(train_data,train_label)
In [258]:
pred_train=clsf.predict(train_data)
print "Accuracy Score for trainig Data :",accuracy_score(pred_train,train_label)*100,"%"
In [259]:
pred_test = clsf.predict(n_test_data)   # n_test_data already has PassengerId excluded (see the normalization cell above)
In [260]:
#Convert to dataframe
result=pd.DataFrame({"PassengerId" :test_data[0::,0],"Survived" : pred_test})
result["Survived"] = result["Survived"].astype(int)
result["PassengerId"] = result["PassengerId"].astype(int)
In [263]:
result
Out[263]:
In [264]:
result.to_csv("Result.csv")
print "Exported"
In [ ]: