In [4]:
from datetime import datetime
import numpy as np
import matplotlib.pyplot as plt
from time import time
import seaborn as sns
import pandas as pd
import scipy
import zipfile
from matplotlib.backends.backend_pdf import PdfPages

In [5]:
# Open the zipped training archive; the handle stays in `z` so its members
# can be read from it afterwards.
train_archive_name = 'train.csv.zip'
z = zipfile.ZipFile(train_archive_name, mode='r')

In [18]:
# Load only the columns that are kept: X, Y and the Category label.
# No parse_dates here: read_csv accepts a bool, list or dict for that
# argument (not a bare column name), and the 'Dates' column is not retained
# anyway, so there is nothing to parse.
TRAIN_COLUMNS = ['X', 'Y', 'Category']
# `with` closes the archive member once pandas has finished reading it.
with z.open('train.csv') as train_file:
    # usecols keeps the file's column order, so re-select to fix the order.
    train = pd.read_csv(train_file, usecols=TRAIN_COLUMNS)[TRAIN_COLUMNS]

In [21]:
# Boolean row mask for a random train/holdout split: True (about 80% of rows)
# marks training rows, False marks held-out rows.
# A dedicated, seeded generator makes the split identical on every
# Restart & Run All and leaves NumPy's global random state untouched.
SPLIT_SEED = 0
TRAIN_FRACTION = 0.8
mark = np.random.RandomState(SPLIT_SEED).rand(len(train)) < TRAIN_FRACTION

In [23]:
# Partition the rows with the boolean mask: rows where `mark` is True are
# used to fit the model, the complement is held out.
holdout_mask = np.logical_not(mark)
knn_train = train.loc[mark]
knn_test = train.loc[holdout_mask]

In [24]:
# Model inputs are the X and Y columns of the training split; the target is
# the Category column converted to a pandas categorical.
feature_columns = ['X', 'Y']
x = knn_train.loc[:, feature_columns]
y = knn_train.Category.astype('category')

In [25]:
from sklearn.neighbors import KNeighborsClassifier

In [26]:
# k-nearest-neighbours classifier that consults the 10 closest training
# points for each prediction; all other settings are library defaults.
neighbour_count = 10
knn = KNeighborsClassifier(n_neighbors=neighbour_count)

In [27]:
# Fit on the training split. fit() returns the estimator itself, so leaving
# the call as the last expression makes the cell echo its configuration.
knn.fit(X=x, y=y)


Out[27]:
KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_neighbors=10, p=2, weights='uniform')

In [28]:
# Inputs (X, Y) and target (Category as a categorical) for the held-out rows.
# NOTE(review): xx / yy are not used in the cells visible here — presumably
# intended for scoring the model on the holdout split; confirm.
holdout_feature_columns = ['X', 'Y']
xx = knn_test.loc[:, holdout_feature_columns]
yy = knn_test.Category.astype('category')

In [46]:
# Read test.csv from its zip archive, parsing the 'Dates' column as datetimes.
# The archive handle `zz` is left open, as before; only the member stream is
# closed, since pandas does not close file objects handed to it by the caller.
zz = zipfile.ZipFile('test.csv.zip')
with zz.open('test.csv') as test_file:
    test = pd.read_csv(test_file, parse_dates=['Dates'])

In [53]:
# Predict one label per test row from its X/Y values.
p = knn.predict(test[['X', 'Y']])

# One 0/1 indicator column per category seen in the training target: 1 where
# the predicted label equals that category, 0 elsewhere.
category_labels = list(y.cat.categories)
indicator_columns = {
    label: (p == label).astype(int) for label in category_labels
}

# Final layout: 'Id' first, then the indicator columns in category order.
submit = pd.concat(
    [
        pd.DataFrame({'Id': test['Id'].tolist()}),
        pd.DataFrame(indicator_columns, columns=category_labels),
    ],
    axis=1,
)

submit.to_csv('k_nearest_neigbour.csv', index=False)