In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
In [2]:
# Load the training and test tables from CSV files in the working directory,
# then report each frame's (rows, columns) shape on its own line.
train_df = pd.read_csv(filepath_or_buffer='train.csv')
test_df = pd.read_csv(filepath_or_buffer='test.csv')
print(train_df.shape, test_df.shape, sep="\n")
In [3]:
# Preview the first five rows of the training data.
train_df.head(n=5)
Out[3]:
In [4]:
# Preview the first five rows of the test data.
test_df.head(n=5)
Out[4]:
In [5]:
# Choose the model inputs (the 'X' and 'Y' columns) and the target column,
# then slice them out of the training frame.
X_names = ['X', 'Y']
y_name = 'Category'
X, y = train_df[X_names], train_df[y_name]

# Shapes of the feature matrix and the target vector, one per line.
print(X.shape, y.shape, sep="\n")
In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for evaluation; the fixed random_state
# makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)
In [ ]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss

# Hold-out evaluation of a k-nearest-neighbours classifier: fit on the
# training split, predict class probabilities for the held-out split and
# score them with multiclass log loss (lower is better).
clf = KNeighborsClassifier(n_neighbors=5000, n_jobs=8)
clf.fit(X_train, y_train)
y_probs = clf.predict_proba(X_test)

# The columns of y_probs are ordered like clf.classes_. Passing that ordering
# explicitly keeps the metric aligned with the probability columns even when
# a class seen during training does not occur in y_test; without it log_loss
# infers the label set from y_test alone and fails on the column mismatch.
score = log_loss(y_test, y_probs, labels=clf.classes_)
print("Score: {}".format(score))
In [ ]:
# Show the predicted class probabilities for the first ten predicted rows.
y_probs[:10]
In [ ]:
# Display the held-out labels ordered by their original row index.
# NOTE(review): rows of y_probs follow the order of X_test as returned by
# train_test_split, not this sorted order, so the two displays do not line
# up row-for-row.
y_test.sort_index(ascending=True)
In [ ]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Cross-validated search over the neighbour count k = 20..39 on the training
# split. Candidates are ranked with the 'neg_log_loss' scorer.
scoring = 'neg_log_loss'
param_grid = dict(n_neighbors=list(range(20, 40)))

# fit() returns the fitted search object, so it can be chained onto the
# constructor call.
clf2 = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring=scoring,
    n_jobs=8,
).fit(X_train, y_train)
print("All computations DONE.")
In [ ]:
# best parameters found
# Report on the fitted grid search, which lives in `clf2`. `clf` is a plain
# KNeighborsClassifier and has none of the attributes read below.
print("Best parameters:")
print(clf2.best_params_)
print("With log_loss:")
# The search was scored with 'neg_log_loss', so best_score_ holds the negated
# loss; flip the sign so the printed number matches the label above.
print(-clf2.best_score_)
# Per-candidate cross-validation results. `cv_results_` replaces the
# deprecated `grid_scores_` attribute; only the summary columns are shown.
print(
    pd.DataFrame(clf2.cv_results_)[
        ["param_n_neighbors", "mean_test_score", "std_test_score", "rank_test_score"]
    ]
)
print(clf2.scorer_)
In [ ]:
# first submission
# Refit a k-NN classifier on the complete labelled data (X, y) and predict
# class probabilities for the test set using the same feature columns.
# NOTE(review): n_neighbors=3 lies outside the 20..39 range explored by the
# grid search; confirm this value is intentional.
from sklearn.neighbors import KNeighborsClassifier

clf = KNeighborsClassifier(n_neighbors=3, n_jobs=4).fit(X, y)
# Class labels of the fitted model; used as the submission's column headers.
classes = clf.classes_
y_probs = clf.predict_proba(test_df[X_names])
In [ ]:
# Number of distinct classes the fitted classifier knows about.
clf.classes_.shape[0]
In [ ]:
# Assemble the submission table: one row per predicted test sample, indexed
# 0..n-1 under the name 'Id', with one probability column per class label.
submission_baseline_df = pd.DataFrame(
    data=y_probs,
    index=list(range(len(y_probs))),
    columns=classes,
)
submission_baseline_df.index.name = 'Id'
In [ ]:
# Preview the first five rows of the submission table.
submission_baseline_df.head(n=5)
In [ ]:
# Show the submission's column labels (the class names) as a NumPy array.
np.asarray(submission_baseline_df.columns)
In [ ]:
# Write the submission to disk; the index (named 'Id') is written as the
# first column of the CSV.
submission_baseline_df.to_csv("baseline_knn.csv", index=True)
print("DONE")
In [ ]:
# Sanity check: read the written file back and display its (rows, columns)
# shape.
submission_test = pd.read_csv(filepath_or_buffer="baseline_knn.csv")
submission_test.shape