In [1]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn import cross_validation
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.grid_search import RandomizedSearchCV
from sklearn.pipeline import Pipeline
from scipy.stats import randint as sp_randint
from time import time
from operator import itemgetter
import numpy as np
import pandas as pd
In [2]:
# Read the training and test tables from CSV files one directory above
# the notebook's working directory.
df_train = pd.read_csv(filepath_or_buffer='../Shelter_train.csv')
df_test = pd.read_csv(filepath_or_buffer='../Shelter_test.csv')
In [3]:
# Split the training frame positionally: every column except the last is a
# feature, and the last column is the target.
# `.iloc` is used because the slicing is purely positional; the older `.ix`
# indexer is deprecated and no longer exists in current pandas releases.
X = df_train.iloc[:, :-1]
y = df_train.iloc[:, -1]

# The 'ID' column is not passed to the model. The axis is given by keyword
# because newer pandas versions reject it as a positional argument.
# NOTE(review): this rebinds df_test, so re-running the cell without reloading
# the data raises a KeyError ('ID' is already gone).
df_test = df_test.drop('ID', axis=1)
In [4]:
# Baseline: a k-nearest-neighbours classifier with default settings, scored
# with the "log_loss" scorer under cross_val_score's default CV splitting.
# The call is the cell's last expression so that the array of per-fold scores
# is displayed as the cell output.
clf = KNeighborsClassifier()
cross_validation.cross_val_score(estimator=clf, X=X, y=y, scoring="log_loss")
Out[4]:
In [5]:
def report(grid_scores, n_top=3):
    """Print a summary of the best-scoring parameter settings.

    Parameters
    ----------
    grid_scores : iterable
        Score records. Each record must be indexable (position 1 is used as
        the sort key) and expose the attributes ``parameters``,
        ``mean_validation_score`` and ``cv_validation_scores``.
    n_top : int, optional
        How many of the highest-ranked records to print (default 3).

    Returns
    -------
    None
        The summary is written to standard output.
    """
    # Highest value at position 1 first; slicing afterwards keeps the best n_top.
    ranked = list(grid_scores)
    ranked.sort(key=itemgetter(1), reverse=True)

    for rank, entry in enumerate(ranked[:n_top], start=1):
        # Spread of the individual cross-validation fold scores.
        fold_std = np.std(entry.cv_validation_scores)
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
            entry.mean_validation_score, fold_std))
        print("Parameters: {0}".format(entry.parameters))
        print("")
In [6]:
# Search space for the randomized hyper-parameter search.
# Keys follow the "<pipeline step>__<parameter>" convention, addressing the
# 'featureSelection' and 'clf' steps of the pipeline.
params = {
    # Number of top-scoring features kept by the selector: 2 through 8.
    "featureSelection__k": list(range(2, 9)),
    # Neighbour count, sampled from a scipy discrete uniform distribution.
    "clf__n_neighbors": sp_randint(1, 30),
    "clf__weights": ["uniform", "distance"],
    "clf__algorithm": ["auto", "ball_tree", "kd_tree", "brute"],
    # Leaf size, sampled from a scipy discrete uniform distribution.
    "clf__leaf_size": sp_randint(10, 40),
    # Minkowski power parameter: 1 or 2.
    "clf__p": [1, 2],
}
In [7]:
# Chain univariate feature selection (f_classif scores) with the k-NN
# classifier so both steps are tuned and fitted together by the search.
pipeline = Pipeline([
    ('featureSelection', SelectKBest(f_classif)),
    ('clf', KNeighborsClassifier())
])

# Fix the randomness of the search so the 50 sampled candidates are the same
# on every Restart & Run All. `random_state` seeds the sampler itself; the
# global NumPy seed is set as well because older scikit-learn releases draw
# from scipy.stats distributions through NumPy's global RNG.
SEARCH_SEED = 42
np.random.seed(SEARCH_SEED)
rand_search = RandomizedSearchCV(pipeline, params, n_iter=50,
                                 scoring='log_loss', random_state=SEARCH_SEED)

# Time the full search (fit of every candidate on every CV split).
start = time()
rand_search.fit(X, y)
print("RandomizedSearchCV took %.2f seconds for %d candidate parameter settings."
      % (time() - start, len(rand_search.grid_scores_)))
report(rand_search.grid_scores_)

# Class probabilities for the test set from the best pipeline found.
predictions = rand_search.predict_proba(df_test)

# NOTE(review): the column labels assume predict_proba returns the classes in
# exactly this order — confirm against the fitted classifier's classes_.
output = pd.DataFrame(predictions, columns=['Adoption', 'Died', 'Euthanasia', 'Return_to_owner', 'Transfer'])

# Label the index 'ID' and shift it so IDs start at 1 instead of 0.
output.index.names = ['ID']
output.index += 1
output.head()
Out[7]:
In [8]:
# Write the predicted class probabilities to the submission file, with the
# index column labelled 'ID'.
output.to_csv(path_or_buf='../submission-KNN.3.0.csv', index_label='ID')