In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
# Load the labelled training set and the unlabelled test set from the
# current working directory.
train_df = pd.read_csv("train.csv")
test_df = pd.read_csv("test.csv")

# Show the (rows, columns) shape of each frame, one per line.
print(train_df.shape, test_df.shape, sep="\n")


(878049, 9)
(884262, 7)

In [3]:
# Preview the first five training rows to see the available columns.
train_df.head(n=5)


Out[3]:
Dates Category Descript DayOfWeek PdDistrict Resolution Address X Y
0 2015-05-13 23:53:00 WARRANTS WARRANT ARREST Wednesday NORTHERN ARREST, BOOKED OAK ST / LAGUNA ST -122.425892 37.774599
1 2015-05-13 23:53:00 OTHER OFFENSES TRAFFIC VIOLATION ARREST Wednesday NORTHERN ARREST, BOOKED OAK ST / LAGUNA ST -122.425892 37.774599
2 2015-05-13 23:33:00 OTHER OFFENSES TRAFFIC VIOLATION ARREST Wednesday NORTHERN ARREST, BOOKED VANNESS AV / GREENWICH ST -122.424363 37.800414
3 2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Wednesday NORTHERN NONE 1500 Block of LOMBARD ST -122.426995 37.800873
4 2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Wednesday PARK NONE 100 Block of BRODERICK ST -122.438738 37.771541

In [4]:
# Preview the first five test rows; compare its columns with the training frame.
test_df.head(n=5)


Out[4]:
Id Dates DayOfWeek PdDistrict Address X Y
0 0 2015-05-10 23:59:00 Sunday BAYVIEW 2000 Block of THOMAS AV -122.399588 37.735051
1 1 2015-05-10 23:51:00 Sunday BAYVIEW 3RD ST / REVERE AV -122.391523 37.732432
2 2 2015-05-10 23:50:00 Sunday NORTHERN 2000 Block of GOUGH ST -122.426002 37.792212
3 3 2015-05-10 23:45:00 Sunday INGLESIDE 4700 Block of MISSION ST -122.437394 37.721412
4 4 2015-05-10 23:45:00 Sunday INGLESIDE 4700 Block of MISSION ST -122.437394 37.721412

In [5]:
# Column holding the label to predict, and the columns used as features.
y_name = 'Category'
X_names = ['X', 'Y']

# Feature matrix and label vector taken from the training frame.
X, y = train_df[X_names], train_df[y_name]

# Sanity check: both must have the same number of rows.
print(X.shape, y.shape, sep="\n")


(878049, 2)
(878049,)

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the labelled rows for local evaluation; the fixed
# random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=1,
)

In [ ]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import log_loss

# Fit a k-nearest-neighbours classifier on the training split.
clf = KNeighborsClassifier(n_neighbors = 5000, n_jobs=8)
clf.fit(X_train, y_train)

# Class-probability estimates for the hold-out rows; the columns are
# ordered like clf.classes_.
y_probs = clf.predict_proba(X_test)

# Pass the label set explicitly. Without it, log_loss infers the labels from
# y_test alone, so a class that is present in the training split but absent
# from the hold-out split would make the column count of y_probs disagree
# with the inferred labels and the call would fail.
score = log_loss(y_test, y_probs, labels=clf.classes_)
print("Score: {}".format(score))

In [ ]:
# Inspect the predicted class probabilities of the first ten rows.
y_probs[:10]

In [ ]:
# Show the hold-out labels ordered by their original row index.
y_test.sort_index(ascending=True)

In [ ]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

# Candidates are every neighbourhood size from 20 to 39 inclusive, ranked by
# the 'neg_log_loss' scorer.
scoring = 'neg_log_loss'
param_grid = {"n_neighbors": list(range(20, 40))}

# Exhaustive cross-validated search over the grid, run on 8 parallel jobs.
clf2 = GridSearchCV(
    estimator=KNeighborsClassifier(),
    param_grid=param_grid,
    scoring=scoring,
    n_jobs=8,
)
clf2.fit(X_train, y_train)

print("All computations DONE.")

In [ ]:
# Report the outcome of the grid search. The fitted search object is `clf2`;
# `clf` is a plain KNeighborsClassifier and has no best_params_ / best_score_.
print("Best parameters:")
print(clf2.best_params_)
print("With log_loss:")
# When the search is scored with a negated metric such as 'neg_log_loss',
# this value is the negative of the log loss (closer to zero is better).
print(clf2.best_score_)

# `grid_scores_` no longer exists in scikit-learn; `cv_results_` is its
# replacement. Show one row per candidate, best-ranked first.
cv_summary = pd.DataFrame(clf2.cv_results_)[
    ["param_n_neighbors", "mean_test_score", "std_test_score", "rank_test_score"]
].sort_values("rank_test_score")
print(cv_summary)
print(clf2.scorer_)

Baseline submission score: 22


In [ ]:
# first submission
from sklearn.neighbors import KNeighborsClassifier

# Refit on ALL labelled rows (not just the training split) before predicting
# on the competition test set.
# NOTE(review): with 3 neighbours and the default uniform weighting, each
# predicted probability can only be 0, 1/3, 2/3 or 1; zero probabilities are
# penalised heavily by log loss, so consider a larger n_neighbors.
clf = KNeighborsClassifier(n_neighbors=3, n_jobs=4).fit(X, y)

# Column order of the probability matrix produced below.
classes = clf.classes_

# One row of class probabilities per test row, using the same feature columns.
y_probs = clf.predict_proba(test_df[X_names])

In [ ]:
# Number of distinct classes the fitted model knows about.
clf.classes_.shape[0]

In [ ]:
# Build the submission table: one row per test record, one column per class.
# Index by the real `Id` column of test_df rather than a synthesized 0..n-1
# range, so rows stay correctly labelled even if the ids are not positional.
submission_baseline_df = pd.DataFrame(
    data=y_probs,
    index=pd.Index(test_df['Id'].values, name='Id'),
    columns=classes,
)

In [ ]:
# Preview the first five submission rows.
submission_baseline_df.head(n=5)

In [ ]:
# List the column labels of the submission as a NumPy array.
np.asarray(submission_baseline_df.columns)

In [ ]:
# Write the submission to disk; the index is written as the first CSV column.
submission_baseline_df.to_csv(path_or_buf="baseline_knn.csv", index=True)
print("DONE")

In [ ]:
# Read the written file back and show its shape as a sanity check.
submission_test = pd.read_csv(filepath_or_buffer="baseline_knn.csv")
submission_test.shape