Kaggle: San Francisco Crime Classification

Improvement as part of sdap17 excercise 3


In [1]:
import pandas as pd
import numpy as np
import pprint
import requests

Exploration of the training data set


In [2]:
train_data = pd.read_csv("../../data/raw/train.csv")
train_data['Dates'] = pd.to_datetime(train_data['Dates'])

test_data = pd.read_csv("../../data/raw/test.csv")
test_data['Dates'] = pd.to_datetime(test_data['Dates'])

In [3]:
len(train_data)


Out[3]:
878049

In [4]:
train_data.head()


Out[4]:
Dates Category Descript DayOfWeek PdDistrict Resolution Address X Y
0 2015-05-13 23:53:00 WARRANTS WARRANT ARREST Wednesday NORTHERN ARREST, BOOKED OAK ST / LAGUNA ST -122.425892 37.774599
1 2015-05-13 23:53:00 OTHER OFFENSES TRAFFIC VIOLATION ARREST Wednesday NORTHERN ARREST, BOOKED OAK ST / LAGUNA ST -122.425892 37.774599
2 2015-05-13 23:33:00 OTHER OFFENSES TRAFFIC VIOLATION ARREST Wednesday NORTHERN ARREST, BOOKED VANNESS AV / GREENWICH ST -122.424363 37.800414
3 2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Wednesday NORTHERN NONE 1500 Block of LOMBARD ST -122.426995 37.800873
4 2015-05-13 23:30:00 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Wednesday PARK NONE 100 Block of BRODERICK ST -122.438738 37.771541

In [5]:
crimes = train_data['Category'].unique()
pprint.pprint("Crimes: {}, #{}".format(crimes, len(crimes)), indent=2)


("Crimes: ['WARRANTS' 'OTHER OFFENSES' 'LARCENY/THEFT' 'VEHICLE THEFT' "
 "'VANDALISM'\n"
 " 'NON-CRIMINAL' 'ROBBERY' 'ASSAULT' 'WEAPON LAWS' 'BURGLARY'\n"
 " 'SUSPICIOUS OCC' 'DRUNKENNESS' 'FORGERY/COUNTERFEITING' 'DRUG/NARCOTIC'\n"
 " 'STOLEN PROPERTY' 'SECONDARY CODES' 'TRESPASS' 'MISSING PERSON' 'FRAUD'\n"
 " 'KIDNAPPING' 'RUNAWAY' 'DRIVING UNDER THE INFLUENCE'\n"
 " 'SEX OFFENSES FORCIBLE' 'PROSTITUTION' 'DISORDERLY CONDUCT' 'ARSON'\n"
 " 'FAMILY OFFENSES' 'LIQUOR LAWS' 'BRIBERY' 'EMBEZZLEMENT' 'SUICIDE'\n"
 " 'LOITERING' 'SEX OFFENSES NON FORCIBLE' 'EXTORTION' 'GAMBLING'\n"
 " 'BAD CHECKS' 'TREA' 'RECOVERED VEHICLE' 'PORNOGRAPHY/OBSCENE MAT'], #39")

In [6]:
train_data['Category'].value_counts()


Out[6]:
LARCENY/THEFT                  174900
OTHER OFFENSES                 126182
NON-CRIMINAL                    92304
ASSAULT                         76876
DRUG/NARCOTIC                   53971
VEHICLE THEFT                   53781
VANDALISM                       44725
WARRANTS                        42214
BURGLARY                        36755
SUSPICIOUS OCC                  31414
MISSING PERSON                  25989
ROBBERY                         23000
FRAUD                           16679
FORGERY/COUNTERFEITING          10609
SECONDARY CODES                  9985
WEAPON LAWS                      8555
PROSTITUTION                     7484
TRESPASS                         7326
STOLEN PROPERTY                  4540
SEX OFFENSES FORCIBLE            4388
DISORDERLY CONDUCT               4320
DRUNKENNESS                      4280
RECOVERED VEHICLE                3138
KIDNAPPING                       2341
DRIVING UNDER THE INFLUENCE      2268
RUNAWAY                          1946
LIQUOR LAWS                      1903
ARSON                            1513
LOITERING                        1225
EMBEZZLEMENT                     1166
SUICIDE                           508
FAMILY OFFENSES                   491
BAD CHECKS                        406
BRIBERY                           289
EXTORTION                         256
SEX OFFENSES NON FORCIBLE         148
GAMBLING                          146
PORNOGRAPHY/OBSCENE MAT            22
TREA                                6
Name: Category, dtype: int64

Generate time based features


In [7]:
def get_halfhour(minute):
    if minute < 30:
        return 0
    else:
        return 1

def get_daynight(hour):
    if 5 < hour and hour < 23:
        return 0
    else:
        return 1
    
def generate_time_features(times):
    minute_series = pd.Series([x.minute for x in times], name='minute')
    halfhour_series = pd.Series([get_halfhour(x.minute) for x in times], name='halfhour')
    hour_series = pd.Series([x.hour for x in times], name='hour')
    daynight_series = pd.Series([get_daynight(x.hour) for x in times], name='day_night')
    day_series = pd.Series([x.day for x in times], name='day')
    month_series = pd.Series([x.month for x in times], name='month')
    year_series = pd.Series([x.year for x in times], name='year')
    
    time_features = pd.concat([minute_series, halfhour_series, hour_series, daynight_series, day_series, month_series, year_series], axis=1)
    return time_features

In [8]:
times = train_data["Dates"]

In [9]:
time_features = generate_time_features(times)
print("success")


success

In [10]:
print(time_features)


        minute  halfhour  hour  day_night  day  month  year
0           53         1    23          1   13      5  2015
1           53         1    23          1   13      5  2015
2           33         1    23          1   13      5  2015
3           30         1    23          1   13      5  2015
4           30         1    23          1   13      5  2015
5           30         1    23          1   13      5  2015
6           30         1    23          1   13      5  2015
7           30         1    23          1   13      5  2015
8            0         0    23          1   13      5  2015
9            0         0    23          1   13      5  2015
10          58         1    22          0   13      5  2015
11          30         1    22          0   13      5  2015
12          30         1    22          0   13      5  2015
13           6         0    22          0   13      5  2015
14           0         0    22          0   13      5  2015
15           0         0    22          0   13      5  2015
16           0         0    22          0   13      5  2015
17          55         1    21          0   13      5  2015
18          40         1    21          0   13      5  2015
19          30         1    21          0   13      5  2015
20          30         1    21          0   13      5  2015
21          17         0    21          0   13      5  2015
22          11         0    21          0   13      5  2015
23          11         0    21          0   13      5  2015
24          10         0    21          0   13      5  2015
25           0         0    21          0   13      5  2015
26           0         0    21          0   13      5  2015
27           0         0    21          0   13      5  2015
28           0         0    21          0   13      5  2015
29          56         1    20          0   13      5  2015
...        ...       ...   ...        ...  ...    ...   ...
878019      37         1     2          1    6      1  2003
878020      32         1     2          1    6      1  2003
878021      24         0     2          1    6      1  2003
878022      16         0     2          1    6      1  2003
878023      15         0     2          1    6      1  2003
878024       9         0     2          1    6      1  2003
878025       6         0     2          1    6      1  2003
878026       6         0     2          1    6      1  2003
878027       0         0     2          1    6      1  2003
878028       0         0     2          1    6      1  2003
878029      54         1     1          1    6      1  2003
878030      54         1     1          1    6      1  2003
878031      50         1     1          1    6      1  2003
878032      36         1     1          1    6      1  2003
878033      30         1     1          1    6      1  2003
878034      30         1     1          1    6      1  2003
878035      55         1     0          1    6      1  2003
878036      55         1     0          1    6      1  2003
878037      55         1     0          1    6      1  2003
878038      42         1     0          1    6      1  2003
878039      40         1     0          1    6      1  2003
878040      33         1     0          1    6      1  2003
878041      31         1     0          1    6      1  2003
878042      20         0     0          1    6      1  2003
878043      20         0     0          1    6      1  2003
878044      15         0     0          1    6      1  2003
878045       1         0     0          1    6      1  2003
878046       1         0     0          1    6      1  2003
878047       1         0     0          1    6      1  2003
878048       1         0     0          1    6      1  2003

[878049 rows x 7 columns]

Create grid for sector analysis


In [11]:
# outliers are all at position X = -120.5, Y = 90

def filter_x(x):
    if (x > -122):
        return -122.4483364
    else: 
        return x
    
def filter_y(y):
    if y > 37.9:
        return 37.7563690
    else:
        return y

In [13]:
# take a look at the positions of our train data.
min_x_train = min([filter_x(x) for x in train_data["X"]]) 
max_x_train = max([filter_x(x) for x in train_data["X"]]) 
min_y_train = min([filter_y(y) for y in train_data["Y"]]) 
max_y_train = max([filter_y(y) for y in train_data["Y"]]) 
print("Min_X_train: ", min_x_train)
print("Max_X_train: ", max_x_train)
print("Min_Y_train: ", min_y_train)
print("Max_Y_train: ", max_y_train)


Min_X_train:  -122.513642064
Max_X_train:  -122.364937494
Min_Y_train:  37.7078790224
Max_Y_train:  37.8199754923

In [14]:
# take a look at the positions of our test data.
min_x_test = min([filter_x(x) for x in test_data["X"]]) 
max_x_test = max([filter_x(x) for x in test_data["X"]]) 
min_y_test = min([filter_y(y) for y in test_data["Y"]]) 
max_y_test = max([filter_y(y) for y in test_data["Y"]]) 
print("Min_X_test: ", min_x_test)
print("Max_X_test: ", max_x_test)
print("Min_Y_test: ", min_y_test)
print("Max_Y_test: ", max_y_test)


Min_X_test:  -122.513642064
Max_X_test:  -122.364750704
Min_Y_test:  37.7078790224
Max_Y_test:  37.8206208381

In [15]:
# Final coordinates for grid that covers San Francisco.
min_x = -122.53
max_x = -122.35
min_y = 37.65
max_y = 37.84

dif_x = max_x - min_x
dif_y = max_y - min_y

In [16]:
# grid functions

def get_subregion_pos(subregion_id, min_x, min_y, dif_x, dif_y, x_sections, y_sections):
    x = subregion_id % x_sections
    x_pos = ((x + 1/2) / x_sections) * dif_x + min_x
    y = subregion_id // x_sections
    y_pos = ((y + 1/2) / y_sections) * dif_y + min_y
    return (x_pos, y_pos)

def get_subregion(pos_x, pos_y, min_x, min_y, dif_x, dif_y, x_sections, y_sections):
    x = pos_x - min_x
    x_sec = int(x_sections * x / dif_x)
    y = pos_y - min_y
    y_sec = int(y_sections * y / dif_y)
    return x_sec + x_sections * y_sec
    
def get_subregion_series(data, min_x, min_y, dif_x, dif_y):
    X_SECTIONS = 20
    Y_SECTIONS = 20
    subregion_list = []
    for i in range(len(data)):
        pos_x = data["X"][i]
        pos_y = data["Y"][i]
        subregion = get_subregion(pos_x, pos_y, min_x, min_y, dif_x, dif_y, X_SECTIONS, Y_SECTIONS)
        subregion_list.append(subregion)
    return pd.Series(subregion_list, name='subregion')

In [17]:
subregion_series = get_subregion_series(train_data, min_x, min_y, dif_x, dif_y)

In [19]:
# look at the numer of crimes in each subregion
subregion_series.value_counts()


Out[19]:
293       74546
292       53553
273       44346
272       43574
274       39301
252       30585
232       25023
294       18081
313       17712
251       16341
212       15606
271       15095
270       12688
175       12615
233       12410
314       12002
253       11647
291       11593
290       11016
195       10852
248       10356
230        9019
176        8894
332        8889
213        8388
269        8049
250        7950
150        7929
310        7860
311        7708
          ...  
285        1053
131        1050
286         965
188         951
244         785
135         769
243         757
228         752
284         751
152         696
124         696
189         577
156         569
163         553
125         433
261         384
143         263
216         232
162         195
326         110
144         108
357          93
110425       67
236          54
136          50
197          49
283          43
123          24
328           2
142           1
Name: subregion, dtype: int64

In [21]:
# highest crime rate around union square
get_subregion_pos(293, min_x, min_y, dif_x, dif_y, 20, 20)


Out[21]:
(-122.40849999999999, 37.78775)

police station one hot encoding


In [22]:
# generate one hot encoding of police destricts
one_hot_police_destricts = pd.get_dummies(train_data["PdDistrict"])

In [23]:
one_hot_police_destricts["NORTHERN"]


Out[23]:
0         1
1         1
2         1
3         1
4         0
5         0
6         0
7         0
8         0
9         0
10        0
11        0
12        0
13        1
14        0
15        0
16        0
17        0
18        0
19        0
20        0
21        0
22        0
23        0
24        1
25        0
26        1
27        0
28        0
29        0
         ..
878019    0
878020    1
878021    1
878022    0
878023    0
878024    0
878025    0
878026    0
878027    0
878028    0
878029    0
878030    0
878031    0
878032    1
878033    0
878034    0
878035    1
878036    1
878037    1
878038    0
878039    1
878040    0
878041    0
878042    0
878043    0
878044    0
878045    0
878046    0
878047    0
878048    0
Name: NORTHERN, dtype: uint8

crime distribution per subregion


In [49]:
regions = subregion_series.unique()
crimes = train_data['Category'].unique()

In [50]:
# count crimes in each region
criminal_activity_local = {}
criminal_activity_overall = train_data["Category"].value_counts()
for r in regions:
    criminal_activity_local[r] = {}
    criminal_activity_local[r]["N"] = 0
    for c in crimes:
        criminal_activity_local[r][c] = 0
for i, r in enumerate(subregion_series):
    criminal_activity_local[r][train_data["Category"][i]] += 1
    criminal_activity_local[r]["N"] += 1

In [51]:
# union square
criminal_activity_local[293]


Out[51]:
{'ARSON': 59,
 'ASSAULT': 6036,
 'BAD CHECKS': 25,
 'BRIBERY': 12,
 'BURGLARY': 2636,
 'DISORDERLY CONDUCT': 443,
 'DRIVING UNDER THE INFLUENCE': 86,
 'DRUG/NARCOTIC': 7971,
 'DRUNKENNESS': 461,
 'EMBEZZLEMENT': 171,
 'EXTORTION': 15,
 'FAMILY OFFENSES': 28,
 'FORGERY/COUNTERFEITING': 831,
 'FRAUD': 1863,
 'GAMBLING': 10,
 'KIDNAPPING': 144,
 'LARCENY/THEFT': 19036,
 'LIQUOR LAWS': 169,
 'LOITERING': 97,
 'MISSING PERSON': 773,
 'N': 74546,
 'NON-CRIMINAL': 8364,
 'OTHER OFFENSES': 9644,
 'PORNOGRAPHY/OBSCENE MAT': 2,
 'PROSTITUTION': 149,
 'RECOVERED VEHICLE': 106,
 'ROBBERY': 1961,
 'RUNAWAY': 9,
 'SECONDARY CODES': 512,
 'SEX OFFENSES FORCIBLE': 307,
 'SEX OFFENSES NON FORCIBLE': 3,
 'STOLEN PROPERTY': 405,
 'SUICIDE': 33,
 'SUSPICIOUS OCC': 2099,
 'TREA': 0,
 'TRESPASS': 1123,
 'VANDALISM': 1721,
 'VEHICLE THEFT': 1375,
 'WARRANTS': 5322,
 'WEAPON LAWS': 545}

In [52]:
# global crime distribution
distribution_global = {}
for c in crimes:
    distribution_global[c] = criminal_activity_overall[c] / len(train_data)
for c in distribution_global:
    print(c, distribution_global[c])


WARRANTS 0.0480770435363
OTHER OFFENSES 0.143707241851
LARCENY/THEFT 0.199191616869
VEHICLE THEFT 0.0612505680207
VANDALISM 0.0509367928214
NON-CRIMINAL 0.105123973719
ROBBERY 0.0261944378958
ASSAULT 0.0875532003339
WEAPON LAWS 0.00974319200865
BURGLARY 0.0418598506461
SUSPICIOUS OCC 0.0357770466113
DRUNKENNESS 0.00487444322583
FORGERY/COUNTERFEITING 0.0120824692016
DRUG/NARCOTIC 0.0614669568555
STOLEN PROPERTY 0.00517055426292
SECONDARY CODES 0.0113718027126
TRESPASS 0.00834349791413
MISSING PERSON 0.0295985759337
FRAUD 0.0189955230289
KIDNAPPING 0.00266613822235
RUNAWAY 0.00221627722371
DRIVING UNDER THE INFLUENCE 0.00258299935425
SEX OFFENSES FORCIBLE 0.00499744319508
PROSTITUTION 0.00852344231358
DISORDERLY CONDUCT 0.00491999877
ARSON 0.0017231384581
FAMILY OFFENSES 0.000559194304646
LIQUOR LAWS 0.00216730501373
BRIBERY 0.000329138806604
EMBEZZLEMENT 0.00132794411246
SUICIDE 0.000578555410917
LOITERING 0.0013951385401
SEX OFFENSES NON FORCIBLE 0.000168555513417
EXTORTION 0.000291555482667
GAMBLING 0.000166277736208
BAD CHECKS 0.000462388773292
TREA 6.833331625e-06
RECOVERED VEHICLE 0.00357383243988
PORNOGRAPHY/OBSCENE MAT 2.50555492917e-05

In [65]:
# local crime distribution
distribution_local = {}
sufficient_n = 500
for r in regions:
    distribution_local[r] = {}
    for c in crimes: 
        if criminal_activity_local[r]["N"] >= sufficient_n:
            distribution_local[r][c] = criminal_activity_local[r][c] / criminal_activity_local[r]["N"]
        else:
            distribution_local[r][c] = distribution_global[c]

In [72]:
# crime distribution at union square
print(distribution_local[293])


{'WARRANTS': 0.07139216054516674, 'OTHER OFFENSES': 0.1293697850991334, 'LARCENY/THEFT': 0.2553591071284844, 'VEHICLE THEFT': 0.018444986987900088, 'VANDALISM': 0.02308641644085531, 'NON-CRIMINAL': 0.11219917903039733, 'ROBBERY': 0.026305905078743325, 'ASSAULT': 0.08097013924288358, 'WEAPON LAWS': 0.007310922115204035, 'BURGLARY': 0.03536071687280337, 'SUSPICIOUS OCC': 0.028157111045528937, 'DRUNKENNESS': 0.00618410109194323, 'FORGERY/COUNTERFEITING': 0.011147479408687254, 'DRUG/NARCOTIC': 0.10692726638585572, 'STOLEN PROPERTY': 0.005432887076436026, 'SECONDARY CODES': 0.006868242427494433, 'TRESPASS': 0.015064523918117672, 'MISSING PERSON': 0.01036943632119765, 'FRAUD': 0.02499128055160572, 'KIDNAPPING': 0.0019316931827328093, 'RUNAWAY': 0.00012073082392080058, 'DRIVING UNDER THE INFLUENCE': 0.0011536500952432055, 'SEX OFFENSES FORCIBLE': 0.004118262549298419, 'PROSTITUTION': 0.0019987658626888097, 'DISORDERLY CONDUCT': 0.005942639444101628, 'ARSON': 0.0007914576234808038, 'FAMILY OFFENSES': 0.0003756070077536018, 'LIQUOR LAWS': 0.002267056582512811, 'BRIBERY': 0.00016097443189440078, 'EMBEZZLEMENT': 0.002293885654495211, 'SUICIDE': 0.0004426796877096021, 'LOITERING': 0.0013012099911464063, 'SEX OFFENSES NON FORCIBLE': 4.0243607973600196e-05, 'EXTORTION': 0.00020121803986800096, 'GAMBLING': 0.00013414535991200063, 'BAD CHECKS': 0.0003353633997800016, 'TREA': 0.0, 'RECOVERED VEHICLE': 0.0014219408150672069, 'PORNOGRAPHY/OBSCENE MAT': 2.682907198240013e-05}

In [70]:
sum(distribution_local[293]


Out[70]:
{'ARSON': 0.0007914576234808038,
 'ASSAULT': 0.08097013924288358,
 'BAD CHECKS': 0.0003353633997800016,
 'BRIBERY': 0.00016097443189440078,
 'BURGLARY': 0.03536071687280337,
 'DISORDERLY CONDUCT': 0.005942639444101628,
 'DRIVING UNDER THE INFLUENCE': 0.0011536500952432055,
 'DRUG/NARCOTIC': 0.10692726638585572,
 'DRUNKENNESS': 0.00618410109194323,
 'EMBEZZLEMENT': 0.002293885654495211,
 'EXTORTION': 0.00020121803986800096,
 'FAMILY OFFENSES': 0.0003756070077536018,
 'FORGERY/COUNTERFEITING': 0.011147479408687254,
 'FRAUD': 0.02499128055160572,
 'GAMBLING': 0.00013414535991200063,
 'KIDNAPPING': 0.0019316931827328093,
 'LARCENY/THEFT': 0.2553591071284844,
 'LIQUOR LAWS': 0.002267056582512811,
 'LOITERING': 0.0013012099911464063,
 'MISSING PERSON': 0.01036943632119765,
 'NON-CRIMINAL': 0.11219917903039733,
 'OTHER OFFENSES': 0.1293697850991334,
 'PORNOGRAPHY/OBSCENE MAT': 2.682907198240013e-05,
 'PROSTITUTION': 0.0019987658626888097,
 'RECOVERED VEHICLE': 0.0014219408150672069,
 'ROBBERY': 0.026305905078743325,
 'RUNAWAY': 0.00012073082392080058,
 'SECONDARY CODES': 0.006868242427494433,
 'SEX OFFENSES FORCIBLE': 0.004118262549298419,
 'SEX OFFENSES NON FORCIBLE': 4.0243607973600196e-05,
 'STOLEN PROPERTY': 0.005432887076436026,
 'SUICIDE': 0.0004426796877096021,
 'SUSPICIOUS OCC': 0.028157111045528937,
 'TREA': 0.0,
 'TRESPASS': 0.015064523918117672,
 'VANDALISM': 0.02308641644085531,
 'VEHICLE THEFT': 0.018444986987900088,
 'WARRANTS': 0.07139216054516674,
 'WEAPON LAWS': 0.007310922115204035}

In [ ]: