In [4]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [7]:
train_df_ = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print(train_df_.shape)
print(test_df.shape)


(878049, 9)
(884262, 7)

In [8]:
train_df = train_df_.sample(frac=0.1)

In [18]:
from datetime import time, datetime, date

def add_minutes_column(df):
    """
    add extra column that contains only time without date information
    for further filtering by time slot (see filter_by_timeslot())
    """
    time_only = pd.to_datetime(df["Dates"]).apply(lambda x: x.time())
    # combine time with dummy date 
    df["Minutes"] = time_only.apply(lambda x: datetime.combine(date(2000,3,3), x))
    
    return df

def filter_by_timeslot(train_df, middle_of_interval, minutes):
    """
    filter rows from train set that are within a time slot from 
    a current test element
    
    @train_df: train dataframe
    @middle_of_interval: timestamp that describes middle of the time slot
    @minutes: size of half of time slot (in both directions)
    
    returns: a dataframe with data that fulfills a timeslot condition
    """
    # pandas.tslib.Timestamp
    b = middle_of_interval

    zero_del = pd.Timedelta('0 sec')

    # define deltas for positive and negative differences
    pos_del = pd.Timedelta(minutes=minutes)
    neg_del = -pd.Timedelta(minutes=minutes)

    # find rows that are later than current test row
    positive = train_df.loc[(train_df["Minutes"] - b) > zero_del]
    # find rows that are before current test row
    negative = train_df.loc[(train_df["Minutes"] - b) < zero_del]
    
    pos_fil = positive.loc[(train_df["Minutes"] - b) < pos_del]
    neg_fil = negative.loc[(train_df["Minutes"] - b) > neg_del]
    
    return pd.concat([pos_fil,neg_fil])

In [10]:
train_df = add_minutes_column(train_df)

In [26]:
train_df.head()


Out[26]:
Dates Category Descript DayOfWeek PdDistrict Resolution Address X Y Minutes
3488 2015-04-27 23:14:00 WARRANTS ENROUTE TO DEPARTMENT OF CORRECTIONS Monday RICHMOND ARREST, BOOKED GEARY BL / 33RD AV -122.493297 37.779686 2000-03-03 23:14:00
686714 2005-08-22 19:00:00 LARCENY/THEFT ATTEMPTED THEFT FROM LOCKED VEHICLE Monday NORTHERN NONE MCALLISTER ST / VANNESS AV -122.420250 37.780075 2000-03-03 19:00:00
744468 2004-10-15 13:00:00 SUSPICIOUS OCC SUSPICIOUS OCCURRENCE Friday MISSION NONE 2800 Block of 24TH ST -122.408665 37.752879 2000-03-03 13:00:00
26239 2015-01-08 10:13:00 STOLEN PROPERTY STOLEN PROPERTY, POSSESSION WITH KNOWLEDGE, RE... Thursday TARAVAL ARREST, BOOKED 2300 Block of 14TH AV -122.470368 37.744203 2000-03-03 10:13:00
271920 2011-09-06 09:00:00 NON-CRIMINAL PROPERTY FOR IDENTIFICATION Tuesday SOUTHERN NONE 800 Block of BRYANT ST -122.403405 37.775421 2000-03-03 09:00:00

In [11]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(train_df, test_size=0.3, random_state=1)

In [23]:
from sklearn.neighbors import KNeighborsClassifier
from itertools import repeat
import time

all_classes = ['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
       'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
       'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
       'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
       'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
       'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
       'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
       'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
       'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
       'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
       'WARRANTS', 'WEAPON LAWS']

y_name = 'Category'
X_names = ['X', 'Y']

max_rows = len(test)
#max_rows = 3
y_probs = np.zeros((max_rows, 39))

for i in range(max_rows):
    start_time = time.time()
    
    current_test_row = test.iloc[[i]]
    filtered_train = filter_by_timeslot(train, current_test_row["Minutes"].iloc[0], minutes=20)
    X_train = filtered_train[X_names]
    y_train = filtered_train[y_name]
    
    X_test = current_test_row[X_names]
        
    print(time.time() - start_time)
    start_time = time.time()

    
    clf = KNeighborsClassifier(n_neighbors = 500, n_jobs=1)
    clf.fit(X_train, y_train)

    
    # determine the classes that were not present in the training set;
    # the ones that were are listed in clf.classes_.
    classes_not_trained = set(clf.classes_).symmetric_difference(all_classes)
    print(time.time() - start_time)
    start_time = time.time()

    # the order of classes in predict_proba's output matches that in clf.classes_.
    prob = clf.predict_proba(X_test)
    new_prob = []
    for row in prob:
        prob_per_class = list(zip(clf.classes_, prob[0])) + list(zip(classes_not_trained, repeat(0.)))
        # put the probabilities in class order
        prob_per_class = sorted(prob_per_class)
        new_prob = [i[1] for i in prob_per_class]
    new_prob = np.asarray(new_prob)
    print(time.time() - start_time)
    print()

    # add prediction probabilities for current row
    y_probs[i] = new_prob
    if i % 100 == 0:
        print("Iteration {} out of {}".format(i, max_rows))


0.036954641342163086
0.004361152648925781
0.0055522918701171875

Iteration 0 out of 26342
0.032051801681518555
0.001913309097290039
0.023064374923706055

0.033803462982177734
0.003193378448486328
0.005626678466796875

0.032321929931640625
0.005945920944213867
0.005591869354248047

0.0322880744934082
0.0021796226501464844
0.005434751510620117

0.04564714431762695
0.0033674240112304688
0.005579471588134766

0.03286862373352051
0.0074291229248046875
0.005410194396972656

0.03284788131713867
0.005298137664794922
0.005423784255981445

0.032095909118652344
0.00433802604675293
0.005357027053833008

0.03104567527770996
0.002360105514526367
0.005465030670166016

0.03224372863769531
0.0020694732666015625
0.005217075347900391

0.047269582748413086
0.003337860107421875
0.005617618560791016

0.032241106033325195
0.0020384788513183594
0.0050699710845947266

0.03137969970703125
0.0035860538482666016
0.005471706390380859

0.03216409683227539
0.006409406661987305
0.0054531097412109375

0.03360795974731445
0.0034897327423095703
0.005488395690917969

0.031015634536743164
0.002880573272705078
0.005453348159790039

0.048334360122680664
0.005198001861572266
0.0054779052734375

0.032985687255859375
0.004025936126708984
0.0053446292877197266

0.031113624572753906
0.0019183158874511719
0.005191802978515625

0.032048940658569336
0.0015859603881835938
0.005026102066040039

0.031340837478637695
0.0031194686889648438
0.005494832992553711

0.03148508071899414
0.004509925842285156
0.005383968353271484

0.07640480995178223
0.0020749568939208984
0.005436897277832031

0.03253579139709473
0.0015912055969238281
0.006906270980834961

0.03199625015258789
0.003233671188354492
0.0056078433990478516

0.032332420349121094
0.00805521011352539
0.005347728729248047

0.032251596450805664
0.002344369888305664
0.005196094512939453

0.031191110610961914
0.003267526626586914
0.0054225921630859375

0.04635024070739746
0.0028235912322998047
0.00516200065612793

0.031855106353759766
0.002004861831665039
0.00501561164855957

0.03146958351135254
0.005861997604370117
0.005523681640625

0.03212285041809082
0.0032155513763427734
0.005269765853881836

0.031407833099365234
0.0038754940032958984
0.005493879318237305

0.032012224197387695
0.007847309112548828
0.005635261535644531

0.04733633995056152
0.002039194107055664
0.005404949188232422

0.03333759307861328
0.0031044483184814453
0.005450725555419922

0.03160381317138672
0.003842592239379883
0.0053157806396484375

0.030268192291259766
0.001583099365234375
0.0053098201751708984

0.031185626983642578
0.003940105438232422
0.00541377067565918

0.031931161880493164
0.005433082580566406
0.005437612533569336

0.046407222747802734
0.0021860599517822266
0.005168914794921875

0.032234907150268555
0.0017426013946533203
0.005264759063720703

0.030699729919433594
0.001787424087524414
0.004945516586303711

0.03153824806213379
0.0026357173919677734
0.005189418792724609

0.03190875053405762
0.0034432411193847656
0.005613088607788086

0.03202390670776367
0.0021255016326904297
0.0052487850189208984

0.04638075828552246
0.005827665328979492
0.005399227142333984

0.033594608306884766
0.006373167037963867
0.0053746700286865234

0.03412151336669922
0.005888938903808594
0.005391836166381836

0.03206014633178711
0.0067119598388671875
0.005419254302978516

0.032111406326293945
0.005498409271240234
0.005328655242919922

0.0314326286315918
0.0011043548583984375
---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-23-19582dadeab3> in <module>()
     47 
     48     # the order of classes in predict_proba's output matches that in clf.classes_.
---> 49     prob = clf.predict_proba(X_test)
     50     new_prob = []
     51     for row in prob:

/gpfs/software/x86_64/anaconda/envs/anaconda431-py35/lib/python3.5/site-packages/sklearn/neighbors/classification.py in predict_proba(self, X)
    188         X = check_array(X, accept_sparse='csr')
    189 
--> 190         neigh_dist, neigh_ind = self.kneighbors(X)
    191 
    192         classes_ = self.classes_

/gpfs/software/x86_64/anaconda/envs/anaconda431-py35/lib/python3.5/site-packages/sklearn/neighbors/base.py in kneighbors(self, X, n_neighbors, return_distance)
    341                 "Expected n_neighbors <= n_samples, "
    342                 " but n_samples = %d, n_neighbors = %d" %
--> 343                 (train_size, n_neighbors)
    344             )
    345         n_samples, _ = X.shape

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 343, n_neighbors = 500

In [17]:
from sklearn.metrics import log_loss

y_test = test[y_name]

score = log_loss(y_test, y_probs, labels=all_classes)
print("Score: {}".format(score))


Score: 20.72696614350754