Shelter Animal Outcomes 10

Gradient Tree Boosting



In [1]:

    
from sklearn.ensemble import GradientBoostingClassifier
from sklearn import cross_validation
from sklearn.feature_selection import RFECV
from sklearn.grid_search import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from time import time
from operator import itemgetter
import numpy as np
import pandas as pd



In [2]:

    
df_train = pd.read_csv('../Shelter_train.csv')
df_test = pd.read_csv('../Shelter_test.csv')



In [3]:

    
X = df_train.ix[:, :-1]
y = df_train.ix[:, -1]
df_test = df_test.drop('ID', 1)



In [4]:

    
clf = GradientBoostingClassifier()
cross_validation.cross_val_score(clf, X, y, scoring="log_loss")









    Out[4]:





array([-0.95119677, -0.94772602, -0.93670763])



In [5]:

    
params = {
          "clf__max_features": [0.1, 0.3, 1.0],
          "clf__learning_rate" : [0.01, 0.03, 0.1, 0.3],
          "clf__min_samples_split": [1, 3, 10],
          "clf__max_depth": [3, 5, 6],
          "clf__max_features": [0.1, 0.3, 1.0]
}



In [6]:

    
def report(grid_scores, n_top=3):
    top_scores = sorted(grid_scores, key=itemgetter(1), reverse=True)[:n_top]
    for i, score in enumerate(top_scores):
        print("Model with rank: {0}".format(i + 1))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              score.mean_validation_score,
              np.std(score.cv_validation_scores)))
        print("Parameters: {0}".format(score.parameters))
        print("")



In [7]:

    
pipeline = Pipeline([
        ('featureSelection', RFECV(estimator=GradientBoostingClassifier(n_estimators=10), scoring='log_loss')),
        ('clf', GradientBoostingClassifier(n_estimators=1000))
        ])
rand_search = RandomizedSearchCV(pipeline, params, n_iter=100, n_jobs=-1, scoring='log_loss')
start = time()
rand_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(rand_search.grid_scores_)))
report(rand_search.grid_scores_)
predictions = rand_search.predict_proba(df_test)
output = pd.DataFrame(predictions, columns=['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'])
output.index.names = ['ID']
output.index += 1
output.head()









    



RandomizedSearchCV took 9553.61 seconds for 100 candidate parameter settings.
Model with rank: 1
Mean validation score: -0.945 (std: 0.006)
Parameters: {'clf__max_features': 0.3, 'clf__max_depth': 3, 'clf__learning_rate': 0.01, 'clf__min_samples_split': 10}

Model with rank: 2
Mean validation score: -0.945 (std: 0.006)
Parameters: {'clf__max_features': 0.3, 'clf__max_depth': 3, 'clf__learning_rate': 0.01, 'clf__min_samples_split': 1}

Model with rank: 3
Mean validation score: -0.945 (std: 0.006)
Parameters: {'clf__max_features': 0.3, 'clf__max_depth': 3, 'clf__learning_rate': 0.01, 'clf__min_samples_split': 3}







    Out[7]:






  
    
      
      Adoption
      Died
      Euthanasia
      Return_to_owner
      Transfer
    
  
  
    
      1
      0.080615
      0.005951
      0.085572
      0.286303
      0.541559
    
    
      2
      0.595245
      0.001091
      0.027656
      0.254156
      0.121852
    
    
      3
      0.661075
      0.003448
      0.031854
      0.101615
      0.202007
    
    
      4
      0.111369
      0.007495
      0.095261
      0.324243
      0.461632
    
    
      5
      0.527595
      0.001488
      0.025323
      0.270525
      0.175069



In [8]:

    
output.to_csv('../submission-GradientBoostingClassifier.3.0.csv', index_label = 'ID')

	Adoption	Died	Euthanasia	Return_to_owner	Transfer
1	0.080615	0.005951	0.085572	0.286303	0.541559
2	0.595245	0.001091	0.027656	0.254156	0.121852
3	0.661075	0.003448	0.031854	0.101615	0.202007
4	0.111369	0.007495	0.095261	0.324243	0.461632
5	0.527595	0.001488	0.025323	0.270525	0.175069