notebook.community

Edit and run



In [4]:

    
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt



In [7]:

    
train_df_ = pd.read_csv('train.csv')
test_df = pd.read_csv('test.csv')
print(train_df_.shape)
print(test_df.shape)









    



(878049, 9)
(884262, 7)



In [8]:

    
train_df = train_df_.sample(frac=0.1)



In [18]:

    
from datetime import time, datetime, date

def add_minutes_column(df):
    """
    add extra column that contains only time without date information
    for further filtering by time slot (see filter_by_timeslot())
    """
    time_only = pd.to_datetime(df["Dates"]).apply(lambda x: x.time())
    # combine time with dummy date 
    df["Minutes"] = time_only.apply(lambda x: datetime.combine(date(2000,3,3), x))
    
    return df

def filter_by_timeslot(train_df, middle_of_interval, minutes):
    """
    filter rows from train set that are within a time slot from 
    a current test element
    
    @train_df: train dataframe
    @middle_of_interval: timestamp that describes middle of the time slot
    @minutes: size of half of time slot (in both directions)
    
    returns: a dataframe with data that fulfills a timeslot condition
    """
    # pandas.tslib.Timestamp
    b = middle_of_interval

    zero_del = pd.Timedelta('0 sec')

    # define deltas for positive and negative differences
    pos_del = pd.Timedelta(minutes=minutes)
    neg_del = -pd.Timedelta(minutes=minutes)

    # find rows that are later than current test row
    positive = train_df.loc[(train_df["Minutes"] - b) > zero_del]
    # find rows that are before current test row
    negative = train_df.loc[(train_df["Minutes"] - b) < zero_del]
    
    pos_fil = positive.loc[(train_df["Minutes"] - b) < pos_del]
    neg_fil = negative.loc[(train_df["Minutes"] - b) > neg_del]
    
    return pd.concat([pos_fil,neg_fil])



In [10]:

    
train_df = add_minutes_column(train_df)



In [26]:

    
train_df.head()









    Out[26]:






  
    
      
      Dates
      Category
      Descript
      DayOfWeek
      PdDistrict
      Resolution
      Address
      X
      Y
      Minutes
    
  
  
    
      3488
      2015-04-27 23:14:00
      WARRANTS
      ENROUTE TO DEPARTMENT OF CORRECTIONS
      Monday
      RICHMOND
      ARREST, BOOKED
      GEARY BL / 33RD AV
      -122.493297
      37.779686
      2000-03-03 23:14:00
    
    
      686714
      2005-08-22 19:00:00
      LARCENY/THEFT
      ATTEMPTED THEFT FROM LOCKED VEHICLE
      Monday
      NORTHERN
      NONE
      MCALLISTER ST / VANNESS AV
      -122.420250
      37.780075
      2000-03-03 19:00:00
    
    
      744468
      2004-10-15 13:00:00
      SUSPICIOUS OCC
      SUSPICIOUS OCCURRENCE
      Friday
      MISSION
      NONE
      2800 Block of 24TH ST
      -122.408665
      37.752879
      2000-03-03 13:00:00
    
    
      26239
      2015-01-08 10:13:00
      STOLEN PROPERTY
      STOLEN PROPERTY, POSSESSION WITH KNOWLEDGE, RE...
      Thursday
      TARAVAL
      ARREST, BOOKED
      2300 Block of 14TH AV
      -122.470368
      37.744203
      2000-03-03 10:13:00
    
    
      271920
      2011-09-06 09:00:00
      NON-CRIMINAL
      PROPERTY FOR IDENTIFICATION
      Tuesday
      SOUTHERN
      NONE
      800 Block of BRYANT ST
      -122.403405
      37.775421
      2000-03-03 09:00:00



In [11]:

    
from sklearn.model_selection import train_test_split

train, test = train_test_split(train_df, test_size=0.3, random_state=1)



In [23]:

    
from sklearn.neighbors import KNeighborsClassifier
from itertools import repeat
import time

all_classes = ['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
       'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
       'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
       'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
       'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
       'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
       'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
       'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
       'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
       'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
       'WARRANTS', 'WEAPON LAWS']

y_name = 'Category'
X_names = ['X', 'Y']

max_rows = len(test)
#max_rows = 3
y_probs = np.zeros((max_rows, 39))

for i in range(max_rows):
    start_time = time.time()
    
    current_test_row = test.iloc[[i]]
    filtered_train = filter_by_timeslot(train, current_test_row["Minutes"].iloc[0], minutes=20)
    X_train = filtered_train[X_names]
    y_train = filtered_train[y_name]
    
    X_test = current_test_row[X_names]
        
    print(time.time() - start_time)
    start_time = time.time()

    
    clf = KNeighborsClassifier(n_neighbors = 500, n_jobs=1)
    clf.fit(X_train, y_train)

    
    # determine the classes that were not present in the training set;
    # the ones that were are listed in clf.classes_.
    classes_not_trained = set(clf.classes_).symmetric_difference(all_classes)
    print(time.time() - start_time)
    start_time = time.time()

    # the order of classes in predict_proba's output matches that in clf.classes_.
    prob = clf.predict_proba(X_test)
    new_prob = []
    for row in prob:
        prob_per_class = list(zip(clf.classes_, prob[0])) + list(zip(classes_not_trained, repeat(0.)))
        # put the probabilities in class order
        prob_per_class = sorted(prob_per_class)
        new_prob = [i[1] for i in prob_per_class]
    new_prob = np.asarray(new_prob)
    print(time.time() - start_time)
    print()

    # add prediction probabilities for current row
    y_probs[i] = new_prob
    if i % 100 == 0:
        print("Iteration {} out of {}".format(i, max_rows))









    



0.036954641342163086
0.004361152648925781
0.0055522918701171875

Iteration 0 out of 26342
0.032051801681518555
0.001913309097290039
0.023064374923706055

0.033803462982177734
0.003193378448486328
0.005626678466796875

0.032321929931640625
0.005945920944213867
0.005591869354248047

0.0322880744934082
0.0021796226501464844
0.005434751510620117

0.04564714431762695
0.0033674240112304688
0.005579471588134766

0.03286862373352051
0.0074291229248046875
0.005410194396972656

0.03284788131713867
0.005298137664794922
0.005423784255981445

0.032095909118652344
0.00433802604675293
0.005357027053833008

0.03104567527770996
0.002360105514526367
0.005465030670166016

0.03224372863769531
0.0020694732666015625
0.005217075347900391

0.047269582748413086
0.003337860107421875
0.005617618560791016

0.032241106033325195
0.0020384788513183594
0.0050699710845947266

0.03137969970703125
0.0035860538482666016
0.005471706390380859

0.03216409683227539
0.006409406661987305
0.0054531097412109375

0.03360795974731445
0.0034897327423095703
0.005488395690917969

0.031015634536743164
0.002880573272705078
0.005453348159790039

0.048334360122680664
0.005198001861572266
0.0054779052734375

0.032985687255859375
0.004025936126708984
0.0053446292877197266

0.031113624572753906
0.0019183158874511719
0.005191802978515625

0.032048940658569336
0.0015859603881835938
0.005026102066040039

0.031340837478637695
0.0031194686889648438
0.005494832992553711

0.03148508071899414
0.004509925842285156
0.005383968353271484

0.07640480995178223
0.0020749568939208984
0.005436897277832031

0.03253579139709473
0.0015912055969238281
0.006906270980834961

0.03199625015258789
0.003233671188354492
0.0056078433990478516

0.032332420349121094
0.00805521011352539
0.005347728729248047

0.032251596450805664
0.002344369888305664
0.005196094512939453

0.031191110610961914
0.003267526626586914
0.0054225921630859375

0.04635024070739746
0.0028235912322998047
0.00516200065612793

0.031855106353759766
0.002004861831665039
0.00501561164855957

0.03146958351135254
0.005861997604370117
0.005523681640625

0.03212285041809082
0.0032155513763427734
0.005269765853881836

0.031407833099365234
0.0038754940032958984
0.005493879318237305

0.032012224197387695
0.007847309112548828
0.005635261535644531

0.04733633995056152
0.002039194107055664
0.005404949188232422

0.03333759307861328
0.0031044483184814453
0.005450725555419922

0.03160381317138672
0.003842592239379883
0.0053157806396484375

0.030268192291259766
0.001583099365234375
0.0053098201751708984

0.031185626983642578
0.003940105438232422
0.00541377067565918

0.031931161880493164
0.005433082580566406
0.005437612533569336

0.046407222747802734
0.0021860599517822266
0.005168914794921875

0.032234907150268555
0.0017426013946533203
0.005264759063720703

0.030699729919433594
0.001787424087524414
0.004945516586303711

0.03153824806213379
0.0026357173919677734
0.005189418792724609

0.03190875053405762
0.0034432411193847656
0.005613088607788086

0.03202390670776367
0.0021255016326904297
0.0052487850189208984

0.04638075828552246
0.005827665328979492
0.005399227142333984

0.033594608306884766
0.006373167037963867
0.0053746700286865234

0.03412151336669922
0.005888938903808594
0.005391836166381836

0.03206014633178711
0.0067119598388671875
0.005419254302978516

0.032111406326293945
0.005498409271240234
0.005328655242919922

0.0314326286315918
0.0011043548583984375






    



---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-23-19582dadeab3> in <module>()
     47 
     48     # the order of classes in predict_proba's output matches that in clf.classes_.
---> 49     prob = clf.predict_proba(X_test)
     50     new_prob = []
     51     for row in prob:

/gpfs/software/x86_64/anaconda/envs/anaconda431-py35/lib/python3.5/site-packages/sklearn/neighbors/classification.py in predict_proba(self, X)
    188         X = check_array(X, accept_sparse='csr')
    189 
--> 190         neigh_dist, neigh_ind = self.kneighbors(X)
    191 
    192         classes_ = self.classes_

/gpfs/software/x86_64/anaconda/envs/anaconda431-py35/lib/python3.5/site-packages/sklearn/neighbors/base.py in kneighbors(self, X, n_neighbors, return_distance)
    341                 "Expected n_neighbors <= n_samples, "
    342                 " but n_samples = %d, n_neighbors = %d" %
--> 343                 (train_size, n_neighbors)
    344             )
    345         n_samples, _ = X.shape

ValueError: Expected n_neighbors <= n_samples,  but n_samples = 343, n_neighbors = 500



In [17]:

    
from sklearn.metrics import log_loss

y_test = test[y_name]

score = log_loss(y_test, y_probs, labels=all_classes)
print("Score: {}".format(score))









    



Score: 20.72696614350754

	Dates	Category	Descript	DayOfWeek	PdDistrict	Resolution	Address	X	Y	Minutes
3488	2015-04-27 23:14:00	WARRANTS	ENROUTE TO DEPARTMENT OF CORRECTIONS	Monday	RICHMOND	ARREST, BOOKED	GEARY BL / 33RD AV	-122.493297	37.779686	2000-03-03 23:14:00
686714	2005-08-22 19:00:00	LARCENY/THEFT	ATTEMPTED THEFT FROM LOCKED VEHICLE	Monday	NORTHERN	NONE	MCALLISTER ST / VANNESS AV	-122.420250	37.780075	2000-03-03 19:00:00
744468	2004-10-15 13:00:00	SUSPICIOUS OCC	SUSPICIOUS OCCURRENCE	Friday	MISSION	NONE	2800 Block of 24TH ST	-122.408665	37.752879	2000-03-03 13:00:00
26239	2015-01-08 10:13:00	STOLEN PROPERTY	STOLEN PROPERTY, POSSESSION WITH KNOWLEDGE, RE...	Thursday	TARAVAL	ARREST, BOOKED	2300 Block of 14TH AV	-122.470368	37.744203	2000-03-03 10:13:00
271920	2011-09-06 09:00:00	NON-CRIMINAL	PROPERTY FOR IDENTIFICATION	Tuesday	SOUTHERN	NONE	800 Block of BRYANT ST	-122.403405	37.775421	2000-03-03 09:00:00