Shelter Animal Outcomes 9

AdaBoost


In [1]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn import cross_validation
from sklearn.feature_selection import RFECV
from sklearn.grid_search import GridSearchCV
from sklearn.pipeline import Pipeline
from time import time
from operator import itemgetter
import numpy as np
import pandas as pd

In [2]:
# Load the training table (features plus outcome column) and the test table
# that predictions are generated for further down.
df_train = pd.read_csv(filepath_or_buffer='../Shelter_train.csv')
df_test = pd.read_csv(filepath_or_buffer='../Shelter_test.csv')

In [3]:
# Split the training table by position: every column except the last is a
# feature, and the last column is the target.
# `.iloc` is used instead of `.ix`: `.ix` is deprecated (and removed in later
# pandas releases) and silently mixes label- and position-based lookup, while
# `.iloc` is always positional, which is what this split relies on.
X = df_train.iloc[:, :-1]
y = df_train.iloc[:, -1]

# Drop the 'ID' column so it is not passed to the model as a feature.
# The axis is given by keyword; the positional form `drop('ID', 1)` is not
# accepted by newer pandas releases.
df_test = df_test.drop('ID', axis=1)

In [4]:
# Baseline: cross-validated score of an AdaBoost classifier with default
# hyper-parameters, to compare the tuned pipeline against.
# random_state is fixed so that re-running the notebook reproduces the same
# scores (the default base learner breaks ties between equally good splits
# at random).
clf = AdaBoostClassifier(random_state=0)
# Left as the last expression so the array of per-fold scores is displayed.
cross_validation.cross_val_score(clf, X, y, scoring="log_loss")


Out[4]:
array([-1.56608423, -1.565736  , -1.56166871])

In [5]:
# Hyper-parameter grid handed to the grid search. Every key carries the
# `clf__` prefix, matching the pipeline step named 'clf'.
# 6 learning rates x 2 algorithms x 5 ensemble sizes = 60 candidates.
params = dict(
    clf__learning_rate=[0.01, 0.03, 0.1, 0.3, 1.0, 3.0],
    clf__algorithm=["SAMME", "SAMME.R"],
    clf__n_estimators=[10, 30, 100, 300, 1000],
)

In [6]:
def report(grid_scores, n_top=3):
    """Print a summary of the best-scoring grid-search candidates.

    Parameters
    ----------
    grid_scores : iterable
        Score records to rank. Each record is ordered by its element at
        index 1 and must expose the attributes ``mean_validation_score``,
        ``cv_validation_scores`` and ``parameters``.
    n_top : int, default 3
        Maximum number of records to print.

    Returns
    -------
    None
        The summary is written to standard output only.
    """
    # Highest score first; slicing copes with fewer than n_top records.
    best_first = sorted(grid_scores, key=itemgetter(1), reverse=True)
    for rank, candidate in enumerate(best_first[:n_top], start=1):
        print("Model with rank: {0}".format(rank))
        mean_score = candidate.mean_validation_score
        fold_spread = np.std(candidate.cv_validation_scores)
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            mean_score, fold_spread))
        print("Parameters: {0}".format(candidate.parameters))
        print("")

In [7]:
# Two-step pipeline: cross-validated recursive feature elimination (RFECV)
# selects the features, then an AdaBoost classifier is fit on the selection.
# random_state is fixed on both AdaBoost estimators so that the search is
# reproducible on a fresh kernel (the default base learner breaks ties
# between equally good splits at random).
pipeline = Pipeline([
        ('featureSelection', RFECV(estimator=AdaBoostClassifier(n_estimators=50, random_state=0), scoring='log_loss')),
        # NOTE: every candidate in `params` sets clf__n_estimators, so the
        # value below is only a placeholder during the grid search.
        ('clf', AdaBoostClassifier(n_estimators=1000, random_state=0))
        ])

# Exhaustive search over `params`, scored by log loss, using all CPU cores.
grid_search = GridSearchCV(pipeline, params, n_jobs=-1, scoring='log_loss')
start = time()
grid_search.fit(X, y)
print("GridSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(grid_search.grid_scores_)))
report(grid_search.grid_scores_)

# Class probabilities for the test set from the fitted grid search.
predictions = grid_search.predict_proba(df_test)
# NOTE(review): the column names are hard-coded and assumed to match the
# order of the classifier's classes -- TODO confirm against
# grid_search.best_estimator_.classes_.
output = pd.DataFrame(predictions, columns=['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'])
# Name the index 'ID' and shift it from 0-based to 1-based.
output.index.names = ['ID']
output.index += 1
output.head()


GridSearchCV took 1276.17 seconds for 60 candidate parameter settings.
Model with rank: 1
Mean validation score: -1.045 (std: 0.005)
Parameters: {'clf__learning_rate': 0.01, 'clf__algorithm': 'SAMME.R', 'clf__n_estimators': 10}

Model with rank: 2
Mean validation score: -1.050 (std: 0.004)
Parameters: {'clf__learning_rate': 0.03, 'clf__algorithm': 'SAMME.R', 'clf__n_estimators': 10}

Model with rank: 3
Mean validation score: -1.051 (std: 0.004)
Parameters: {'clf__learning_rate': 0.01, 'clf__algorithm': 'SAMME.R', 'clf__n_estimators': 30}

Out[7]:
Adoption Died Euthanasia Return_to_owner Transfer
1 0.048811 0.022419 0.126710 0.103214 0.698846
2 0.546901 0.002489 0.034359 0.218970 0.197280
3 0.546901 0.002489 0.034359 0.218970 0.197280
4 0.048811 0.022419 0.126710 0.103214 0.698846
5 0.546901 0.002489 0.034359 0.218970 0.197280

In [8]:
# Write the predicted class probabilities to the submission file; the index
# is written out under the column header 'ID'.
output.to_csv(path_or_buf='../submission-AdaBoostClassifier.3.0.csv', index_label='ID')