In [53]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.model_selection import cross_val_score
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
plt.rcParams["figure.dpi"] = 200
plt.rcParams["figure.figsize"] = (5 * (1 + np.sqrt(5)), 10)

In [64]:
train = pd.read_csv("train.csv")  # Model training data
test = pd.read_csv("test.csv")  # Model test data

# Map the Sex column to a binary encoding
genderDic = {'female': 0, 'male': 1}
train['Sex'] = train['Sex'].map(genderDic).astype(int)
test['Sex'] = test['Sex'].map(genderDic).astype(int)

pclasses = train.Pclass.value_counts().index.tolist()  # All Pclass values, here [1, 2, 3], sorted by frequency

# Fill null ages with the median age of each (Sex, Pclass) group

for sex in genderDic.values():  # 0 or 1
    for pclass in pclasses:  # 1, 2, 3
        median_age = train[(train.Sex == sex) & (train.Pclass == pclass)].Age.dropna().median()
        train.loc[(train.Age.isnull()) & (train.Sex == sex) & (train.Pclass == pclass), 'Age'] = median_age
        median_age = test[(test.Sex == sex) & (test.Pclass == pclass)].Age.dropna().median()
        test.loc[(test.Age.isnull()) & (test.Sex == sex) & (test.Pclass == pclass), 'Age'] = median_age


# Fill null fares in the test set with the median fare of the Pclass
if test["Fare"].isnull().any():
    for pclass in pclasses:
        median_fare = test[test.Pclass == pclass]['Fare'].dropna().median()
        test.loc[(test.Fare.isnull()) & (test.Pclass == pclass), 'Fare'] = median_fare
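
The nested loops above work, but the same group-median imputation can be expressed more compactly with groupby/transform. A minimal alternative sketch, assuming the same train and test frames are in scope (not used by the rest of the notebook):

In [ ]:
# Equivalent group-median age imputation using groupby/transform.
# transform('median') ignores NaN values within each (Sex, Pclass) group,
# and fillna aligns the resulting Series on the original index.
for df in (train, test):
    group_median = df.groupby(['Sex', 'Pclass'])['Age'].transform('median')
    df['Age'] = df['Age'].fillna(group_median)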

In [3]:
# Plot the mean survival rate by age
average_age = train[["Age", "Survived"]].groupby(['Age'],as_index=False).mean()
average_age["Age"] = average_age["Age"].astype(int)
sns.barplot(x='Age', y='Survived', data=average_age, ci=None);



In [70]:
# Proportion of passengers with a null Cabin entry, grouped by Survived
# (the CabinIsNull column is created in this cell).
# This shows whether the Cabin column is usable for analysis, or whether it is
# biased by how the data was collected (cabin locations recovered from survivors
# rather than from the ship's manifest). If the proportion for non-survivors is
# near 1, the manifest likely did not record where passengers were located on the ship.
cabin = train[["Survived", "Cabin"]].copy()
cabin["CabinIsNull"] = cabin["Cabin"].isnull().astype(int)
cabin[["Survived","CabinIsNull"]].groupby("Survived").mean()

# The result shows that the Cabin information is not meaningful: it is biased toward the survivors' records.


Out[70]:
          CabinIsNull
Survived
0            0.876138
1            0.602339
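
The same proportions can be cross-checked with pd.crosstab, normalizing within each Survived group. A minimal sketch using the train frame already loaded:

In [ ]:
# Cross-check: share of null vs. non-null Cabin entries within each Survived group.
pd.crosstab(train['Survived'], train['Cabin'].isnull(), normalize='index')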

In [4]:
# In this section we examine the chance of survival versus family size
family = train[["Survived", "SibSp", "Parch", "Sex"]].copy()
family['FamilySize'] = family['SibSp'] + family['Parch']
family["Alone"] = (family["FamilySize"] == 0).astype(int)  # 1 when traveling with no family aboard


sns.factorplot(x="Alone", y="Survived", col="Sex", data=family, kind="bar")
sns.factorplot(x="FamilySize", y="Survived", col="Sex", data=family, kind="bar", ci=90, n_boot=100)
sns.factorplot(x="SibSp", y="Survived", col="Sex", data=family, kind="bar")


Out[4]:
<seaborn.axisgrid.FacetGrid at 0x1d3725d97b8>
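
The bar plots above can be backed with the underlying numbers. A minimal sketch that aggregates survival rate and group size per FamilySize from the family frame built above:

In [ ]:
# Survival rate and sample size per family size, to complement the plots.
family.groupby('FamilySize')['Survived'].agg(['mean', 'count'])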

In [65]:
# This is the actual modeling.
# The classifier used here is a Random Forest, which captures relations in the
# data non-parametrically with an ensemble of decision trees
# (the number of trees is configurable).

# Features to be dropped
dropFeatures = ["Ticket", "Name", "SibSp", "Parch", "Embarked", "Cabin"]

X = train.drop(dropFeatures, axis=1).iloc[:, 2:]  # drop PassengerId and Survived
sns.pairplot(train.drop(dropFeatures, axis=1).iloc[:, 1:], hue="Survived")
Xtest = test.drop(dropFeatures, axis=1).iloc[:, 1:]  # drop PassengerId
y = train["Survived"]



In [72]:
rndF = RandomForestClassifier(max_depth=20, n_estimators=100, max_features="sqrt",
                              n_jobs=-1, random_state=7463, oob_score=True)
rndF.fit(X, y)
scores = cross_val_score(rndF, X, y, cv=100, n_jobs=-1, scoring='accuracy')
print(scores.mean(), scores.std())
sns.barplot(rndF.feature_importances_, X.columns.values)  # feature importances per column
rndF.n_classes_


0.827 0.124754128559
Out[72]:
2
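
Since oob_score=True was requested, rndF.oob_score_ also gives a quick built-in validation estimate. The depth and tree count above are hand-picked; a minimal grid-search sketch over those two hyperparameters (the parameter grid is an illustrative assumption, not tuned values from this notebook):

In [ ]:
from sklearn.model_selection import GridSearchCV

# Hypothetical parameter grid; the values are illustrative, not tuned.
param_grid = {'n_estimators': [50, 100, 200], 'max_depth': [5, 10, 20]}
grid = GridSearchCV(RandomForestClassifier(max_features='sqrt', random_state=7463),
                    param_grid, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X, y)
print(grid.best_params_, grid.best_score_)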

In [49]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# SVC needs standardized features, hence the StandardScaler in the pipeline
clf = make_pipeline(StandardScaler(), SVC(C=1.4))
clf.fit(X, y)
scores = cross_val_score(clf, X, y, cv=30, n_jobs=-1, scoring='accuracy')
print(scores.mean(), scores.std())


0.726984126984 0.148402087729

In [66]:
adaClf = AdaBoostClassifier()
adaClf.fit(X,y)
scores = cross_val_score(adaClf, X, y, cv=100, n_jobs=-1, scoring='accuracy')
print(scores.mean(), scores.std())


0.807027777778 0.134540516065

In [54]:
from sklearn.naive_bayes import GaussianNB

nbClf = GaussianNB()
nbClf.fit(X,y)
scores = cross_val_score(nbClf, X, y, cv=30, n_jobs=-1, scoring='accuracy')
print(scores.mean(), scores.std())


0.742857142857 0.18053418677
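
The four classifiers above were each cross-validated in separate cells. A minimal sketch that runs them through one loop for a side-by-side comparison, reusing the estimators defined above:

In [ ]:
# Side-by-side cross-validated accuracy for the models tried above.
models = {'RandomForest': rndF, 'SVC pipeline': clf, 'AdaBoost': adaClf, 'GaussianNB': nbClf}
for name, model in models.items():
    s = cross_val_score(model, X, y, cv=30, n_jobs=-1, scoring='accuracy')
    print('{:>14}: {:.3f} +/- {:.3f}'.format(name, s.mean(), s.std()))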

In [69]:
Ypred = rndF.predict(Xtest)
submission = pd.DataFrame({"PassengerId": test["PassengerId"],"Survived": Ypred})
submission.to_csv('submission.csv', index=False)
submission


Out[69]:
PassengerId Survived
0 892 0
1 893 0
2 894 1
3 895 1
4 896 0
5 897 0
6 898 0
7 899 0
8 900 1
9 901 0
10 902 0
11 903 0
12 904 1
13 905 0
14 906 1
15 907 1
16 908 0
17 909 1
18 910 1
19 911 0
20 912 0
21 913 0
22 914 1
23 915 0
24 916 1
25 917 0
26 918 1
27 919 1
28 920 1
29 921 0
... ... ...
388 1280 0
389 1281 1
390 1282 0
391 1283 1
392 1284 0
393 1285 0
394 1286 0
395 1287 1
396 1288 0
397 1289 1
398 1290 0
399 1291 0
400 1292 1
401 1293 0
402 1294 1
403 1295 0
404 1296 0
405 1297 0
406 1298 0
407 1299 0
408 1300 1
409 1301 1
410 1302 1
411 1303 1
412 1304 0
413 1305 0
414 1306 1
415 1307 0
416 1308 0
417 1309 0

418 rows × 2 columns

