In [53]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
plt.rcParams["figure.dpi"] = 200
plt.rcParams["figure.figsize"] = (5 * (1 + np.sqrt(5)), 10)

In [64]:
train = pd.read_csv("train.csv")  # Model training data
test = pd.read_csv("test.csv")  # Model test data

# Map the Sex column to a binary encoding
genderDic = {'female': 0, 'male': 1}
train['Sex'] = train['Sex'].map(genderDic).astype(int)
test['Sex'] = test['Sex'].map(genderDic).astype(int)

pclasses = train.Pclass.value_counts().index.tolist()  # All Pclass values, here [1, 2, 3], sorted by frequency

# Fill null ages with the median age of each (Sex, Pclass) group

for sex in genderDic.values():  # 0 or 1
    for pclass in pclasses:  # 1, 2, 3
        median_age = train[(train.Sex == sex) & (train.Pclass == pclass)].Age.dropna().median()
        train.loc[(train.Age.isnull()) & (train.Sex == sex) & (train.Pclass == pclass), 'Age'] = median_age
        median_age = test[(test.Sex == sex) & (test.Pclass == pclass)].Age.dropna().median()
        test.loc[(test.Age.isnull()) & (test.Sex == sex) & (test.Pclass == pclass), 'Age'] = median_age


# Fill null fares in the test set with the median fare of the Pclass
if test["Fare"].isnull().any():
    for pclass in pclasses:
        median_fare = test[test.Pclass == pclass]['Fare'].dropna().median()
        test.loc[(test.Fare.isnull()) & (test.Pclass == pclass), 'Fare'] = median_fare
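
The nested loops above work, but the same group-median imputation can be expressed more compactly with groupby/transform. A minimal alternative sketch, assuming the same train and test frames are in scope (not used by the rest of the notebook):

In [ ]:
# Equivalent group-median age imputation using groupby/transform.
# transform('median') ignores NaN values within each (Sex, Pclass) group,
# and fillna aligns the resulting Series on the original index.
for df in (train, test):
    group_median = df.groupby(['Sex', 'Pclass'])['Age'].transform('median')
    df['Age'] = df['Age'].fillna(group_median)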

In [3]:
# Plot the mean survival rate by age
average_age = train[["Age", "Survived"]].groupby(['Age'],as_index=False).mean()
average_age["Age"] = average_age["Age"].astype(int)
sns.barplot(x='Age', y='Survived', data=average_age, ci=None);



In [70]:
# Proportion of passengers with a null Cabin entry, grouped by Survived
# (the CabinIsNull column is created in this cell).
# This shows whether the Cabin column is usable for analysis, or whether it is
# biased by how the data was collected (cabin locations recovered from survivors
# rather than from the ship's manifest). If the proportion for non-survivors is
# near 1, the manifest likely did not record where passengers were located on the ship.
cabin = train[["Survived", "Cabin"]].copy()
cabin["CabinIsNull"] = cabin["Cabin"].isnull().astype(int)
cabin[["Survived","CabinIsNull"]].groupby("Survived").mean()

# The result shows that the Cabin information is not meaningful: it is biased toward the survivors' records.


Out[70]:
          CabinIsNull
Survived
0            0.876138
1            0.602339
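
The same proportions can be cross-checked with pd.crosstab, normalizing within each Survived group. A minimal sketch using the train frame already loaded:

In [ ]:
# Cross-check: share of null vs. non-null Cabin entries within each Survived group.
pd.crosstab(train['Survived'], train['Cabin'].isnull(), normalize='index')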

In [4]:
# In this section we examine the chance of survival versus family size
family = train[["Survived", "SibSp", "Parch", "Sex"]].copy()
family['FamilySize'] = family['SibSp'] + family['Parch']
family["Alone"] = (family["FamilySize"] == 0).astype(int)  # 1 when traveling with no family aboard


sns.factorplot(x="Alone", y="Survived", col="Sex", data=family, kind="bar")
sns.factorplot(x="FamilySize", y="Survived", col="Sex", data=family, kind="bar", ci=90, n_boot=100)
sns.factorplot(x="SibSp", y="Survived", col="Sex", data=family, kind="bar")


Out[4]:
<seaborn.axisgrid.FacetGrid at 0x1d3725d97b8>
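
The bar plots above can be backed with the underlying numbers. A minimal sketch that aggregates survival rate and group size per FamilySize from the family frame built above:

In [ ]:
# Survival rate and sample size per family size, to complement the plots.
family.groupby('FamilySize')['Survived'].agg(['mean', 'count'])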

In [65]:
# This is the actual modeling.
# The classifier used here is a Random Forest, which captures relations in the
# data non-parametrically with an ensemble of decision trees
# (the number of trees is configurable).

# Features to be dropped
dropFeatures = ["Ticket", "Name", "SibSp", "Parch", "Embarked", "Cabin"]

X = train.drop(dropFeatures, axis=1).iloc[:, 2:]  # drop PassengerId and Survived
sns.pairplot(train.drop(dropFeatures, axis=1).iloc[:, 1:], hue="Survived")
Xtest = test.drop(dropFeatures, axis=1).iloc[:, 1:]  # drop PassengerId
y = train["Survived"]



In [72]:
rndF = RandomForestClassifier(max_depth=20, n_estimators=100, max_features="sqrt",
                              n_jobs=-1, random_state=7463, oob_score=True)
rndF.fit(X, y)
scores = cross_val_score(rndF, X, y, cv=100, n_jobs=-1, scoring='accuracy')
print(scores.mean(), scores.std())
sns.barplot(rndF.feature_importances_, X.columns.values)  # feature importances per column
rndF.n_classes_


0.827 0.124754128559
Out[72]:
2
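
Since oob_score=True was requested, rndF.oob_score_ also gives a quick built-in validation estimate. The depth and tree count above are hand-picked; a minimal grid-search sketch over those two hyperparameters (the parameter grid is an illustrative assumption, not tuned values from this notebook):

In [ ]:
from sklearn.model_selection import GridSearchCV

# Hypothetical parameter grid; the values are illustrative, not tuned.
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 20]}
grid = GridSearchCV(RandomForestClassifier(max_features='sqrt', random_state=7463),
                    param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)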

In [49]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# SVC needs standardized features, hence the StandardScaler in the pipeline
clf = make_pipeline(StandardScaler(), SVC(C=1.4))
clf.fit(X, y)
scores = cross_val_score(clf, X, y, cv=30, n_jobs=-1, scoring='accuracy')
print(scores.mean(), scores.std())


0.726984126984 0.148402087729

In [66]:
adaClf = AdaBoostClassifier()
adaClf.fit(X,y)
scores = cross_val_score(adaClf, X, y, cv=100, n_jobs=-1, scoring='accuracy')
print(scores.mean(), scores.std())


0.807027777778 0.134540516065

In [54]:
from sklearn.naive_bayes import GaussianNB

nbClf = GaussianNB()
nbClf.fit(X,y)
scores = cross_val_score(nbClf, X, y, cv=30, n_jobs=-1, scoring='accuracy')
print(scores.mean(), scores.std())


0.742857142857 0.18053418677
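
The four classifiers above were each cross-validated in separate cells. A minimal sketch that runs them through one loop for a side-by-side comparison, reusing the estimators defined above:

In [ ]:
# Side-by-side cross-validated accuracy for the models tried above.
models = {'RandomForest': rndF, 'SVC pipeline': clf, 'AdaBoost': adaClf, 'GaussianNB': nbClf}
for name, model in models.items():
    s = cross_val_score(model, X, y, cv=30, n_jobs=-1, scoring='accuracy')
    print('{:>14}: {:.3f} +/- {:.3f}'.format(name, s.mean(), s.std()))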

In [69]:
Ypred = rndF.predict(Xtest)
submission = pd.DataFrame({"PassengerId": test["PassengerId"],"Survived": Ypred})
submission.to_csv('submission.csv', index=False)
submission


Out[69]:
PassengerId Survived
0 892 0
1 893 0
2 894 1
3 895 1
4 896 0
5 897 0
6 898 0
7 899 0
8 900 1
9 901 0
10 902 0
11 903 0
12 904 1
13 905 0
14 906 1
15 907 1
16 908 0
17 909 1
18 910 1
19 911 0
20 912 0
21 913 0
22 914 1
23 915 0
24 916 1
25 917 0
26 918 1
27 919 1
28 920 1
29 921 0
... ... ...
388 1280 0
389 1281 1
390 1282 0
391 1283 1
392 1284 0
393 1285 0
394 1286 0
395 1287 1
396 1288 0
397 1289 1
398 1290 0
399 1291 0
400 1292 1
401 1293 0
402 1294 1
403 1295 0
404 1296 0
405 1297 0
406 1298 0
407 1299 0
408 1300 1
409 1301 1
410 1302 1
411 1303 1
412 1304 0
413 1305 0
414 1306 1
415 1307 0
416 1308 0
417 1309 0

418 rows × 2 columns

