In [1]:
#Import pandas, matplotlib and scikit-learn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import ensemble, svm
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import grid_search
from sklearn import metrics

#Plots config
%matplotlib inline
pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
plt.rcParams['figure.figsize'] = (15, 5)

In [3]:
# Optional plotting helpers (currently disabled).
# The two commented-out lines below would add a local checkout of
# sklearn-model-evaluation to the import path and import its `plots` module as `p`.
# While they stay disabled, `sys` is imported but not used by any cell shown here.
# NOTE(review): the path is an absolute, machine-specific location; make it
# configurable (or install the package) before re-enabling.
import sys
#sys.path.append('/Users/Edu/Development/open-source/sklearn-model-evaluation')
#import plots as p

Data loading


In [4]:
# Load both splits from CSV in one pass, using the `id` column as the row index.
train, test = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in ("data/train.csv", 'data/test.csv')
)

In [5]:
# Preview the first rows of the training frame; the recorded output includes the
# `survived` target column alongside the features.
train.head()


Out[5]:
age fare parents_and_children p_class siblings_and_spouses survived fam_size name_Capt name_Col name_Don ... name_the Countess cabin_A cabin_B cabin_C cabin_D cabin_E cabin_F cabin_G cabin_T cabin_U
id
1 22 7.2500 0 3 1 0 1 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
2 38 71.2833 0 1 1 1 1 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
3 26 7.9250 0 3 0 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
4 35 53.1000 0 1 1 1 1 0 0 0 ... 0 0 0 1 0 0 0 0 0 0
5 35 8.0500 0 3 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 34 columns


In [6]:
# Preview the first rows of the test frame; the recorded output has no `survived`
# column, i.e. one column fewer than the training frame.
test.head()


Out[6]:
age fare parents_and_children p_class siblings_and_spouses fam_size name_Capt name_Col name_Don name_Dona ... name_the Countess cabin_A cabin_B cabin_C cabin_D cabin_E cabin_F cabin_G cabin_T cabin_U
id
892 34.5 7.8292 0 3 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
893 47.0 7.0000 0 3 1 1 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
894 62.0 9.6875 0 2 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
895 27.0 8.6625 0 3 0 0 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1
896 22.0 12.2875 1 3 1 2 0 0 0 0 ... 0 0 0 0 0 0 0 0 0 1

5 rows × 33 columns

Model training (with cross-validation)


In [7]:
#RandomForest
# Grid: 2 split criteria x 3 forest sizes x 2 feature-subset rules x 2 bootstrap
# settings = 24 candidates.
rf_param_grid = [{'criterion': ['gini', 'entropy'],
                  'n_estimators': [10, 100, 1000],
                  # 'sqrt' is what 'auto' meant for classifiers; spelling it out keeps
                  # the grid valid on scikit-learn releases that no longer accept 'auto'.
                  'max_features': ['sqrt', 'log2'],
                  'bootstrap': [True, False]
                 }]
# random_state pins the forest's RNG so repeated runs of the search pick the same
# winner. The estimator is built inline so `rf` only ever names the search object
# that later cells fit and predict with.
# No `scoring` is passed: the search falls back to the estimator's own `score`
# (mean accuracy for classifiers). The old commented-out
# `scoring=metrics.accuracy_score` was dropped because `scoring` expects a scorer
# name such as 'accuracy' or a callable taking (estimator, X, y), not a raw metric.
rf = grid_search.GridSearchCV(
    ensemble.RandomForestClassifier(n_jobs=-1, random_state=0),
    rf_param_grid,
)

#AdaBoost
# Only the number of boosting rounds is tuned.
ab_param_grid = {'n_estimators': [10, 50, 100, 1000]}
# random_state pins the booster's RNG so repeated runs of the search pick the same
# winner. `n_estimators=50` is only a starting value: the search overrides it with
# each grid candidate. The estimator is built inline so `ab` only ever names the
# search object that later cells fit and predict with.
# No `scoring` is passed, so the estimator's own `score` (mean accuracy) is used.
ab = grid_search.GridSearchCV(
    ensemble.AdaBoostClassifier(n_estimators=50, random_state=0),
    ab_param_grid,
)


#SVC
# Two sub-grids: a linear kernel tuned over C only, and an RBF kernel tuned over
# both C and gamma.
svc_param_grid = [
    dict(C=[1, 10, 100, 1000], kernel=['linear']),
    dict(C=[1, 10, 100, 1000], gamma=[0.01, 0.001, 0.0001], kernel=['rbf']),
]
# Wrap a default-constructed SVC in the grid search; no explicit scoring is given.
svc = grid_search.GridSearchCV(svm.SVC(), svc_param_grid)

In [9]:
# Feature columns are everything in the training frame except the target.
feature_cols = train.columns.drop('survived')

# Guard against the two CSVs disagreeing on their feature set. `test.values` on its
# own would feed whatever columns test happens to have, in whatever order, to models
# trained on the training layout.
unmatched = set(feature_cols) ^ set(test.columns)
if unmatched:
    raise ValueError("train/test feature columns differ: %s" % sorted(unmatched))

train_x = train[feature_cols].values
train_y = train['survived']
# Select by name so test features are in exactly the training column order.
test_x = test[feature_cols].values

#SVC needs feature scaling
# The scaler is fitted on the training features only and then applied to both splits.
scaler = preprocessing.StandardScaler().fit(train_x)
train_x_scaled = scaler.transform(train_x)
test_x_scaled  = scaler.transform(test_x)

In [10]:
#Perform the SVC grid search
# Fitted on the standardized features, since SVC is the model that needs scaling.
svc.fit(train_x_scaled, train_y)
svc.best_params_
# NOTE(review): an earlier note here read {'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'},
# presumably from a previous run; it does not match the output recorded for this
# cell, so treat the recorded output as the current result and confirm.


Out[10]:
{'C': 100, 'gamma': 0.01, 'kernel': 'rbf'}

In [11]:
#Perform the random forest grid search
# Fitted on the unscaled feature matrix.
rf.fit(train_x, train_y)
# Last expression of the cell: display the winning parameter combination.
rf.best_params_


Out[11]:
{'bootstrap': True,
 'criterion': 'entropy',
 'max_features': 'log2',
 'n_estimators': 100}

In [12]:
#Perform the AdaBoost grid search
# Fitted on the unscaled feature matrix.
ab.fit(train_x, train_y)
# Last expression of the cell: display the winning parameter combination.
ab.best_params_


Out[12]:
{'n_estimators': 50}

Model evaluation


In [13]:
#svc_scores = cross_validation.cross_val_score(svc, train_x_scaled, train_y, cv=5)
#print("SVC accuracy: %0.2f (+/- %0.2f)" % (svc_scores.mean(), svc_scores.std() * 2))
#SVC accuracy: 0.83 (+/- 0.06)

In [14]:
#rf_scores  = cross_validation.cross_val_score(rf, train_x, train_y, cv=5)
#print("RandomForest accuracy: %0.2f (+/- %0.2f)" % (rf_scores.mean(), rf_scores.std() * 2))
#RandomForest accuracy: 0.79 (+/- 0.06)

In [15]:
#ab_scores  = cross_validation.cross_val_score(ab, train_x, train_y, cv=5)
#print("AdaBoost accuracy: %0.2f (+/- %0.2f)" % (ab_scores.mean(), ab_scores.std() * 2))
#AdaBoost accuracy: 0.81 (+/- 0.02)

Model evaluation plots


In [16]:
#svc_train_pred = cross_validation.cross_val_predict(svc, train_x_scaled, train_y, cv=5)

In [17]:
#p.plot_confusion_matrix(train_y, svc_train_pred, target_names=[0,1])

Predictions on test set


In [18]:
# Random forest and AdaBoost were fitted on the unscaled features, so both predict
# from the unscaled test matrix.
rf_pred, ab_pred = (fitted.predict(test_x) for fitted in (rf, ab))
# The SVC search was fitted on standardized features, so it predicts from the
# scaled test matrix.
svc_pred = svc.predict(test_x_scaled)

Save predictions to CSV


In [19]:
def _save_submission(predictions, path):
    """Write one submission CSV and return the frame that was written.

    predictions: array of predicted labels for the rows of `test`; cast to int.
    path: destination CSV file.
    """
    frame = pd.DataFrame(
        data={'PassengerId': test.index, 'Survived': predictions.astype('int')},
        # Explicit column order, so the file layout does not depend on how the
        # installed pandas orders dict keys.
        columns=['PassengerId', 'Survived'],
    )
    frame.to_csv(path, index=False)
    return frame


# One file per model; same paths and write order as before.
_save_submission(rf_pred, "rf_result.csv")
_save_submission(svc_pred, "svc_result.csv")
# `result` ends up holding the AdaBoost submission, as it did after the original
# three copy-pasted snippets ran.
result = _save_submission(ab_pred, "ab_result.csv")

In [ ]:


In [ ]: