In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

In [2]:
train_df = pd.read_csv('train.csv')
#test_df = pd.read_csv('test.csv')
print(train_df.shape)
#print(test_df.shape)


(878049, 9)

In [3]:
#train_df = train_df_.sample(frac=0.1)

In [4]:
from datetime import time, datetime, date

def add_minutes_column(df):
    """
    add extra column that contains only time without date information
    for further filtering by time slot (see filter_by_timeslot())
    """
    time_only = pd.to_datetime(df["Dates"]).apply(lambda x: x.time())
    # combine time with dummy date 
    df["Minutes"] = time_only.apply(lambda x: datetime.combine(date(2000,3,3), x))
    df["Minutes"] = pd.to_datetime(df["Minutes"])
    
    return df

def filter_by_timeslot(df, middle_of_interval, mins):
    """
    filter rows from data set that are within a time slot from 
    a the middle_of_interval
    
    @df:  dataframe
    @middle_of_interval: timestamp that describes middle of the time slot
    @minutes: size of half of time slot (in both directions)
    
    returns: a dataframe with data that fulfills a timeslot condition
    """

    time_start = middle_of_interval - timedelta(minutes=mins)
    time_end   = middle_of_interval + timedelta(minutes=mins)
    

    
    if (time_start.day != time_end.day):
        time_start = time_start + timedelta(days=1)
        time_s = time_start.strftime("%Y-%m-%d %H:%M:%S")
        time_e = time_end.strftime("%Y-%m-%d %H:%M:%S")
       
        return df[(df["Minutes"] > time_s) | (df["Minutes"] <= time_e )]

    else:
        time_s = time_start.strftime("%Y-%m-%d %H:%M:%S")
        time_e = time_end.strftime("%Y-%m-%d %H:%M:%S")  

        return df[(df["Minutes"] > time_s) & (df["Minutes"] <= time_e )]

In [5]:
train_df = add_minutes_column(train_df)

In [6]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(train_df, test_size=0.3, random_state=1)

In [44]:
from sklearn.neighbors import KNeighborsClassifier
from itertools import repeat
import time
from datetime import timedelta

all_classes = ['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
       'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
       'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
       'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
       'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
       'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
       'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
       'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
       'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
       'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
       'WARRANTS', 'WEAPON LAWS']

y_name = 'Category'
X_names = ['X', 'Y']

max_rows = len(test)
#max_rows = 3
#y_probs = np.zeros((max_rows, 39))
y_probs = []
mid_interval = datetime(2000,3,3,0)

step_mins = 10
delta_interval = timedelta(minutes=step_mins)
half_width_train_interval_minutes = step_mins 
half_width_test_interval_minutes = step_mins//2


num_slots = 24/ (step_mins/60)

for i in range(int(num_slots)):
    #start_time = time.time()
    filtered_train = filter_by_timeslot(train, mid_interval, mins=half_width_train_interval_minutes)
    #filtered_train = train
    X_train = filtered_train[X_names]
    y_train = filtered_train[y_name]
    
    filtered_test = filter_by_timeslot(test, mid_interval, mins=half_width_test_interval_minutes)
    #filtered_test = test
    #print(filtered_test.sort_values(by="Minutes")["Minutes"])

    #print(filtered_test.sort_values(by="Minutes")["Minutes"])
    
    
    X_test = filtered_test[X_names]
    print(mid_interval, len(X_test))

    
    #print(type(X_test.index.values))
    #print(X_test.index.values.shape)
        
    #print(time.time() - start_time)
    #start_time = time.time()

    
    clf = KNeighborsClassifier(n_neighbors = 400, n_jobs=4)
    clf.fit(X_train, y_train)

    
    # determine the classes that were not present in the training set;
    # the ones that were are listed in clf.classes_.
    classes_not_trained = set(clf.classes_).symmetric_difference(all_classes)
    #print(time.time() - start_time)
    #start_time = time.time()

    # the order of classes in predict_proba's output matches that in clf.classes_.
    prob = clf.predict_proba(X_test)
    #print(prob)
    new_prob = []
    for row in prob:
        prob_per_class = list(zip(clf.classes_, row)) + list(zip(classes_not_trained, repeat(0.)))
        # put the probabilities in class order
        prob_per_class = sorted(prob_per_class)
        new_prob.append([i[1] for i in prob_per_class])
    new_prob = np.asarray(new_prob)
    #print(new_prob)
    
    new_prob = np.c_[X_test.index.values, new_prob]
    #print(new_prob.shape)

    #print(time.time() - start_time)
    #print()

    # add prediction probabilities for current row
    y_probs.append(new_prob)
    
    mid_interval += delta_interval
    

print("DONE")


2000-03-03 00:00:00 7717
2000-03-03 00:10:00 1338
2000-03-03 00:20:00 921
2000-03-03 00:30:00 1862
2000-03-03 00:40:00 1017
2000-03-03 00:50:00 836
2000-03-03 01:00:00 2759
2000-03-03 01:10:00 911
2000-03-03 01:20:00 845
2000-03-03 01:30:00 1567
2000-03-03 01:40:00 975
2000-03-03 01:50:00 832
2000-03-03 02:00:00 2533
2000-03-03 02:10:00 959
2000-03-03 02:20:00 793
2000-03-03 02:30:00 1234
2000-03-03 02:40:00 676
2000-03-03 02:50:00 572
2000-03-03 03:00:00 1528
2000-03-03 03:10:00 594
2000-03-03 03:20:00 492
2000-03-03 03:30:00 777
2000-03-03 03:40:00 479
2000-03-03 03:50:00 416
2000-03-03 04:00:00 999
2000-03-03 04:10:00 411
2000-03-03 04:20:00 327
2000-03-03 04:30:00 516
2000-03-03 04:40:00 412
2000-03-03 04:50:00 293
2000-03-03 05:00:00 796
2000-03-03 05:10:00 349
2000-03-03 05:20:00 246
2000-03-03 05:30:00 555
2000-03-03 05:40:00 372
2000-03-03 05:50:00 315
2000-03-03 06:00:00 1160
2000-03-03 06:10:00 389
2000-03-03 06:20:00 335
2000-03-03 06:30:00 810
2000-03-03 06:40:00 592
2000-03-03 06:50:00 496
2000-03-03 07:00:00 1993
2000-03-03 07:10:00 803
2000-03-03 07:20:00 653
2000-03-03 07:30:00 1539
2000-03-03 07:40:00 983
2000-03-03 07:50:00 667
2000-03-03 08:00:00 4017
2000-03-03 08:10:00 1045
2000-03-03 08:20:00 840
2000-03-03 08:30:00 2063
2000-03-03 08:40:00 1109
2000-03-03 08:50:00 875
2000-03-03 09:00:00 4261
2000-03-03 09:10:00 1133
2000-03-03 09:20:00 970
2000-03-03 09:30:00 2158
2000-03-03 09:40:00 1176
2000-03-03 09:50:00 943
2000-03-03 10:00:00 4363
2000-03-03 10:10:00 1291
2000-03-03 10:20:00 1049
2000-03-03 10:30:00 2288
2000-03-03 10:40:00 1398
2000-03-03 10:50:00 990
2000-03-03 11:00:00 3851
2000-03-03 11:10:00 1347
2000-03-03 11:20:00 1134
2000-03-03 11:30:00 2444
2000-03-03 11:40:00 1550
2000-03-03 11:50:00 1196
2000-03-03 12:00:00 7581
2000-03-03 12:10:00 1538
2000-03-03 12:20:00 1157
2000-03-03 12:30:00 2627
2000-03-03 12:40:00 1645
2000-03-03 12:50:00 1215
2000-03-03 13:00:00 4565
2000-03-03 13:10:00 1617
2000-03-03 13:20:00 1215
2000-03-03 13:30:00 2659
2000-03-03 13:40:00 1679
2000-03-03 13:50:00 1278
2000-03-03 14:00:00 4720
2000-03-03 14:10:00 1627
2000-03-03 14:20:00 1294
2000-03-03 14:30:00 2711
2000-03-03 14:40:00 1679
2000-03-03 14:50:00 1240
2000-03-03 15:00:00 5298
2000-03-03 15:10:00 1684
2000-03-03 15:20:00 1259
2000-03-03 15:30:00 3040
2000-03-03 15:40:00 1833
2000-03-03 15:50:00 1223
2000-03-03 16:00:00 5139
2000-03-03 16:10:00 1684
2000-03-03 16:20:00 1506
2000-03-03 16:30:00 3112
2000-03-03 16:40:00 1946
2000-03-03 16:50:00 1425
2000-03-03 17:00:00 6004
2000-03-03 17:10:00 1812
2000-03-03 17:20:00 1555
2000-03-03 17:30:00 3251
2000-03-03 17:40:00 1878
2000-03-03 17:50:00 1478
2000-03-03 18:00:00 6798
2000-03-03 18:10:00 1767
2000-03-03 18:20:00 1391
2000-03-03 18:30:00 3493
2000-03-03 18:40:00 1889
2000-03-03 18:50:00 1242
2000-03-03 19:00:00 5693
2000-03-03 19:10:00 1715
2000-03-03 19:20:00 1399
2000-03-03 19:30:00 3204
2000-03-03 19:40:00 1682
2000-03-03 19:50:00 1175
2000-03-03 20:00:00 5647
2000-03-03 20:10:00 1497
2000-03-03 20:20:00 1123
2000-03-03 20:30:00 2818
2000-03-03 20:40:00 1339
2000-03-03 20:50:00 1038
2000-03-03 21:00:00 5186
2000-03-03 21:10:00 1302
2000-03-03 21:20:00 1090
2000-03-03 21:30:00 2752
2000-03-03 21:40:00 1503
2000-03-03 21:50:00 1120
2000-03-03 22:00:00 5507
2000-03-03 22:10:00 1422
2000-03-03 22:20:00 1221
2000-03-03 22:30:00 2952
2000-03-03 22:40:00 1468
2000-03-03 22:50:00 1123
2000-03-03 23:00:00 4861
2000-03-03 23:10:00 1298
2000-03-03 23:20:00 1138
2000-03-03 23:30:00 2616
2000-03-03 23:40:00 1430
2000-03-03 23:50:00 1207
DONE

In [45]:
y_probs_np = np.vstack(y_probs)
y_probs_df = pd.DataFrame(data=y_probs_np[:,1:], index=y_probs_np[:,0], columns=all_classes)
y_probs_df.sort_index(inplace=True)
y_test = test[y_name].sort_index()

In [46]:
from sklearn.metrics import log_loss

#y_probs_np = np.vstack(y_probs)
#y_probs_np.sort(axis=0)

#y_test = test[y_name].sort_index()
#y_test = test[y_name]


score = log_loss(y_test, y_probs_df, labels=all_classes)
print("Score: {}".format(score))


Score: 2.8401297397213683

In [ ]:
from sklearn.metrics import log_loss

y_probs_np = np.vstack(y_probs)
y_probs_np.sort(axis=0)

y_test = test[y_name]

y_probs = clf.predict_proba(X_test)

score = log_loss(y_test, y_probs, labels=all_classes)
print("Score: {}".format(score))

In [ ]:
y_probs

In [ ]:
y_probs_np = np.vstack(y_probs)

In [ ]:
y_test = test[y_name].sort_index()
y_test.head()

In [ ]:
y_probs_np.sort(axis=0)
y_probs_np[0:3,1:]

In [ ]:
y_probs_np[1]