In [1]:
#Imports: pandas, matplotlib and scikit-learn
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import ensemble, svm
from sklearn import cross_validation
from sklearn import preprocessing
from sklearn import grid_search
from sklearn import metrics
#Plots config
%matplotlib inline
pd.set_option('display.mpl_style', 'default') # Make the graphs a bit prettier
plt.rcParams['figure.figsize'] = (15, 5)
In [3]:
#Add plots module to path
import sys
#sys.path.append('/Users/Edu/Development/open-source/sklearn-model-evaluation')
#import plots as p
In [4]:
#Read the data
train = pd.read_csv("data/train.csv", index_col='id')
test = pd.read_csv('data/test.csv', index_col='id')
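Before fitting anything, it helps to confirm that the preprocessed CSVs are fully numeric with no missing values, since the scikit-learn estimators used below require that. A minimal check, nothing model-specific assumed:
In [ ]:
#Sanity check: features should be numeric and free of missing values
print(train.dtypes)
print(train.isnull().sum())
print(test.isnull().sum())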
In [5]:
train.head()
Out[5]:
In [6]:
test.head()
Out[6]:
In [7]:
#RandomForest
rf_param_grid = [{'criterion': ['gini', 'entropy'],
                  'n_estimators': [10, 100, 1000],
                  'max_features': ['auto', 'log2'],
                  'bootstrap': [True, False]}]
rf = ensemble.RandomForestClassifier(n_jobs=-1)
rf = grid_search.GridSearchCV(rf, rf_param_grid)#, scoring=metrics.accuracy_score)
#AdaBoost
ab_param_grid = {'n_estimators': [10, 50, 100, 1000]}
ab = ensemble.AdaBoostClassifier(n_estimators=50)
ab = grid_search.GridSearchCV(ab, ab_param_grid)#, scoring=metrics.accuracy_score)
#SVC
svc_param_grid = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
    {'C': [1, 10, 100, 1000], 'gamma': [0.01, 0.001, 0.0001], 'kernel': ['rbf']},
]
svc = svm.SVC()
svc = grid_search.GridSearchCV(svc, svc_param_grid)#, scoring=metrics.accuracy_score)
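Note that the grid_search and cross_validation modules used above were merged into sklearn.model_selection in scikit-learn 0.18 and later removed. A minimal sketch of the same setup under the newer module layout, reusing the parameter grids defined above (on very recent versions the grids themselves may also need tweaks, e.g. the 'auto' option for max_features was eventually dropped):
In [ ]:
#Equivalent setup on scikit-learn >= 0.18, where grid_search/cross_validation
#live in model_selection (same parameter grids as above)
from sklearn import model_selection
rf_new = model_selection.GridSearchCV(ensemble.RandomForestClassifier(n_jobs=-1), rf_param_grid)
ab_new = model_selection.GridSearchCV(ensemble.AdaBoostClassifier(), ab_param_grid)
svc_new = model_selection.GridSearchCV(svm.SVC(), svc_param_grid)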
In [9]:
#Split features and target
train_x = train.drop(['survived'], axis=1).values
train_y = train['survived']
test_x = test.values
#SVC needs feature scaling
scaler = preprocessing.StandardScaler().fit(train_x)
train_x_scaled = scaler.transform(train_x)
test_x_scaled = scaler.transform(test_x)
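Because the StandardScaler above is fit on the whole training set, the folds inside GridSearchCV are scaled with statistics that already include their own validation rows. An alternative that keeps the scaling inside each fold is to wrap the scaler and the SVC in a Pipeline; a minimal sketch (the parameter names gain the 'svc__' prefix of the pipeline step):
In [ ]:
#Optional: scale inside each CV fold via a Pipeline to avoid leakage
from sklearn.pipeline import Pipeline
svc_pipe = Pipeline([('scale', preprocessing.StandardScaler()), ('svc', svm.SVC())])
svc_pipe_grid = [
    {'svc__C': [1, 10, 100, 1000], 'svc__kernel': ['linear']},
    {'svc__C': [1, 10, 100, 1000], 'svc__gamma': [0.01, 0.001, 0.0001], 'svc__kernel': ['rbf']},
]
svc_pipe_search = grid_search.GridSearchCV(svc_pipe, svc_pipe_grid)
#svc_pipe_search.fit(train_x, train_y)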
In [10]:
#Perform SVC grid search
svc.fit(train_x_scaled, train_y)
svc.best_params_
#{'C': 1000, 'gamma': 0.001, 'kernel': 'rbf'}
Out[10]:
In [11]:
#Perform rf grid search
rf.fit(train_x, train_y)
rf.best_params_
Out[11]:
In [12]:
#Perform AdaBoost grid search
ab.fit(train_x, train_y)
ab.best_params_
Out[12]:
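Once the three searches are fitted, the cross-validated accuracy of each winning parameter set can be compared directly via best_score_; a small helper cell, assuming the fits above completed:
In [ ]:
#Compare the cross-validated score of each tuned model
for name, model in [('SVC', svc), ('RandomForest', rf), ('AdaBoost', ab)]:
    print("%s: %.3f with %s" % (name, model.best_score_, model.best_params_))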
In [13]:
#svc_scores = cross_validation.cross_val_score(svc, train_x_scaled, train_y, cv=5)
#print("SVC accuracy: %0.2f (+/- %0.2f)" % (svc_scores.mean(), svc_scores.std() * 2))
#SVC accuracy: 0.83 (+/- 0.06)
In [14]:
#rf_scores = cross_validation.cross_val_score(rf, train_x, train_y, cv=5)
#print("RandomForest accuracy: %0.2f (+/- %0.2f)" % (rf_scores.mean(), rf_scores.std() * 2))
#RandomForest accuracy: 0.79 (+/- 0.06)
In [15]:
#ab_scores = cross_validation.cross_val_score(ab, train_x, train_y, cv=5)
#print("AdaBoost accuracy: %0.2f (+/- %0.2f)" % (ab_scores.mean(), ab_scores.std() * 2))
#AdaBoost accuracy: 0.81 (+/- 0.02)
In [16]:
#svc_train_pred = cross_validation.cross_val_predict(svc, train_x_scaled, train_y, cv=5)
In [17]:
#p.plot_confusion_matrix(train_y, svc_train_pred, target_names=[0,1])
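The plots module referenced above is an external helper that isn't included here. A minimal sketch of the same confusion matrix using sklearn.metrics and matplotlib directly; it recomputes the cross-validated predictions, so it is as slow as the commented cells above:
In [ ]:
#Confusion matrix without the external plots module
svc_train_pred = cross_validation.cross_val_predict(svc, train_x_scaled, train_y, cv=5)
cm = metrics.confusion_matrix(train_y, svc_train_pred)
plt.matshow(cm, cmap=plt.cm.Blues)
plt.colorbar()
plt.xlabel('Predicted')
plt.ylabel('True')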
In [18]:
#RandomForest
rf_pred = rf.predict(test_x)
#AdaBoost
ab_pred = ab.predict(test_x)
#SVC
svc_pred = svc.predict(test_x_scaled)
In [19]:
#Write one submission file per model
result = pd.DataFrame(data={'PassengerId': test.index, 'Survived': rf_pred.astype('int')})
result.to_csv("rf_result.csv", index=False)
result = pd.DataFrame(data={'PassengerId': test.index, 'Survived': svc_pred.astype('int')})
result.to_csv("svc_result.csv", index=False)
result = pd.DataFrame(data={'PassengerId': test.index, 'Survived': ab_pred.astype('int')})
result.to_csv("ab_result.csv", index=False)