Shelter Animal Outcomes 7

K Nearest Neighbors



In [1]:

    
from sklearn.neighbors import KNeighborsClassifier
from sklearn import cross_validation
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.grid_search import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from scipy.stats import randint as sp_randint
from time import time
from operator import itemgetter
import numpy as np
import pandas as pd



In [2]:

    
# Read the train and test CSVs from the parent directory in one pass;
# the first path feeds df_train, the second df_test.
df_train, df_test = map(pd.read_csv, ('../Shelter_train.csv', '../Shelter_test.csv'))



In [3]:

    
# Split the training frame positionally: every column except the last is a
# feature, the last column is the target.
# .iloc replaces .ix here (.ix was deprecated in pandas 0.20 and later removed);
# both slices are purely positional, so the selection is unchanged.
X = df_train.iloc[:, :-1]
y = df_train.iloc[:, -1]
# Drop the 'ID' column from the test frame so it is not passed to the model.
# axis is given by keyword (the positional form is no longer accepted by newer
# pandas), and errors='ignore' keeps this cell re-runnable after 'ID' is gone.
df_test = df_test.drop('ID', axis=1, errors='ignore')



In [4]:

    
# Baseline: a KNN classifier with library-default settings, scored by
# cross-validated log loss on the full training data. The score array is the
# cell's last expression, so the notebook displays it.
clf = KNeighborsClassifier()
cross_validation.cross_val_score(estimator=clf, X=X, y=y, scoring="log_loss")









    Out[4]:





array([-5.68140971, -5.60506317, -5.79990957])



In [5]:

    
def report(grid_scores, n_top=3):
    """Print a summary of the best-scoring entries in ``grid_scores``.

    Entries are ordered by their second field (index 1), highest first, and
    the first ``n_top`` of them are printed with their rank, mean validation
    score, standard deviation of the per-fold scores, and parameters.

    Parameters
    ----------
    grid_scores : iterable
        Entries that are indexable (index 1 is the sort key) and expose the
        attributes ``mean_validation_score``, ``cv_validation_scores`` and
        ``parameters``.
    n_top : int, optional
        Number of leading entries to print (default 3).
    """
    ranked = sorted(grid_scores, key=lambda entry: entry[1], reverse=True)
    for rank, entry in enumerate(ranked[:n_top], start=1):
        fold_spread = np.std(entry.cv_validation_scores)
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            entry.mean_validation_score, fold_spread))
        print("Parameters: {0}".format(entry.parameters))
        print("")



In [6]:

    
# Search space for the randomized search. Keys follow the
# "<pipeline step name>__<parameter>" convention, so they must match the step
# names 'featureSelection' and 'clf' used when the pipeline is built.
# Lists are sampled from directly; sp_randint objects are distributions.
params = dict(
    featureSelection__k=list(range(2, 9)),
    clf__n_neighbors=sp_randint(1, 30),
    clf__weights=["uniform", "distance"],
    clf__algorithm=["auto", "ball_tree", "kd_tree", "brute"],
    clf__leaf_size=sp_randint(10, 40),
    clf__p=[1, 2],
)



In [7]:

    
# Two-step pipeline: keep the k best features by the f_classif score, then
# classify with K nearest neighbors. Step names must match the prefixes of the
# keys in `params`.
pipeline = Pipeline([
        ('featureSelection', SelectKBest(f_classif)),
        ('clf', KNeighborsClassifier())
    ])
# random_state fixes which 50 candidates are sampled so a Restart & Run All
# revisits the same settings.
# NOTE(review): older scikit-learn/SciPy combinations draw scipy.stats
# distributions from the global NumPy RNG, in which case this seed alone may
# not make the sampling fully deterministic - confirm for the installed versions.
rand_search = RandomizedSearchCV(pipeline, params, n_iter=50, scoring='log_loss',
                                 random_state=42)
start = time()
rand_search.fit(X, y)
# The message names the search class actually used (it previously said
# "GridSearchCV" although this is a randomized search).
print("RandomizedSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(rand_search.grid_scores_)))
report(rand_search.grid_scores_)
# Class-probability predictions for the test set from the fitted search object.
predictions = rand_search.predict_proba(df_test)
# NOTE(review): the column labels are hard-coded and assumed to be in the same
# order as the classifier's classes - verify against the fitted classes before
# submitting.
output = pd.DataFrame(predictions, columns=['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'])
# Label the index 'ID' and shift it to start at 1 instead of 0.
output.index.names = ['ID']
output.index += 1
output.head()









    



GridSearchCV took 169.49 seconds for 50 candidate parameter settings.
Model with rank: 1
Mean validation score: -1.537 (std: 0.084)
Parameters: {'clf__algorithm': 'kd_tree', 'featureSelection__k': 3, 'clf__p': 2, 'clf__weights': 'uniform', 'clf__n_neighbors': 29, 'clf__leaf_size': 19}

Model with rank: 2
Mean validation score: -1.582 (std: 0.007)
Parameters: {'clf__algorithm': 'ball_tree', 'featureSelection__k': 2, 'clf__p': 1, 'clf__weights': 'distance', 'clf__n_neighbors': 28, 'clf__leaf_size': 38}

Model with rank: 3
Mean validation score: -1.583 (std: 0.097)
Parameters: {'clf__algorithm': 'brute', 'featureSelection__k': 3, 'clf__p': 1, 'clf__weights': 'uniform', 'clf__n_neighbors': 27, 'clf__leaf_size': 29}







    Out[7]:






  
    
      
      Adoption
      Died
      Euthanasia
      Return_to_owner
      Transfer
    
  
  
    
      1
      0.137931
      0.0
      0.241379
      0.275862
      0.344828
    
    
      2
      0.586207
      0.0
      0.034483
      0.241379
      0.137931
    
    
      3
      0.551724
      0.0
      0.034483
      0.103448
      0.310345
    
    
      4
      0.000000
      0.0
      0.068966
      0.413793
      0.517241
    
    
      5
      0.517241
      0.0
      0.068966
      0.344828
      0.068966



In [8]:

    
# Write the prediction table to the parent directory; the index is written as
# the first column under the header 'ID'.
output.to_csv(path_or_buf='../submission-KNN.3.0.csv', index_label='ID')

	Adoption	Euthanasia	Return_to_owner	Transfer
1	0.137931	0.241379	0.275862	0.344828
2	0.586207	0.034483	0.241379	0.137931
3	0.551724	0.034483	0.103448	0.310345
4	0.000000	0.068966	0.413793	0.517241
5	0.517241	0.068966	0.344828	0.068966