We are trying to find the correct range of times and locations to do our joins. In this notebook, I will be exploring different types of groupings of the 911 reports, and trying to pick one grouping that doesn't have too many or too few reports.

Note that I don't remove duplicates here since we are mapping (time_range, location_range) to {0, 1}, where 0 means no crime happened in this time_range and location_range, and 1 means one or more crimes happened. Duplicated data would not change the result here.



In [182]:

    
from pylab import *
%matplotlib inline
import pandas as pd

df = pd.read_csv("data/sfpd_incident_2014.csv",
                 names=['IncidntNum','Category','Descript','DayOfWeek','Date','Time','PdDistrict','Resolution','Location','X','Y'],
                 na_values=['-'])

df = df[1:] #the first row is a copy of the labels.



In [183]:

    
max_x_loc = df[['X']][1:].max()
min_x_loc = df[['X']][1:].min()
print "min X location: %f" % min_x_loc
print "max X location: %f" % max_x_loc

max_y_loc = df[['Y']][1:].max()
min_y_loc = df[['Y']][1:].min()
print "min Y location: %f" % min_y_loc
print "max Y location: %f" % max_y_loc

range_x = max_x_loc - min_x_loc
range_y = max_y_loc - min_y_loc
print "range X: %f" % range_x
print "range Y: %f" % range_y









    



min X location: -122.513518
max X location: -122.513023
min Y location: 37.708083
max Y location: 37.818038
range X: 0.000495
range Y: 0.109955



In [184]:

    
import numpy as np
num_loc_bins = 10 #create a grid to map locations on.  Size of grid is square of this number.
x_bins = np.arange(min_x_loc, max_x_loc, range_x/float(num_loc_bins))
y_bins = np.arange(min_y_loc, max_y_loc, range_y/float(num_loc_bins))
print x_bins
print y_bins









    



[-122.5135183  -122.51346877 -122.51341924 -122.51336971 -122.51332018
 -122.51327065 -122.51322112 -122.51317159 -122.51312206 -122.51307253]
[ 37.70808298  37.71907847  37.73007395  37.74106944  37.75206493
  37.76306042  37.77405591  37.78505139  37.79604688  37.80704237]



In [185]:

    
# maps each possible x or y location to a bin.  The bin is denoted by the lowest value in the bin,
# i.e. (The bin is round_down_x plus x_range)
def round_down_x(xloc):
    for bin in x_bins[::-1]: #iterate through x_bins in reverse order
        if xloc >= bin:
            return bin
def round_down_y(yloc):
    for bin in y_bins[::-1]: #iterate through y_bins in reverse order
        if yloc >= bin:
            return bin



In [214]:

    
dff = df.head()
xycol = dff.apply(lambda row: (round_down_x(row['X']), round_down_y(row['Y'])), axis=1)
#dff.append(xycol, ignore_index=True, axis=1)
dff = pd.concat([dff, xycol], axis=1, names='a')
dff.columns.values[-1] = 'XY'
dff = dff[[1,2,3,4,5,6,7,11]]
dff









    Out[214]:






  
    
      
      Category
      Descript
      DayOfWeek
      Date
      Time
      PdDistrict
      Resolution
      XY
    
  
  
    
      1
       NON-CRIMINAL
                     DEATH REPORT, CAUSE UNKNOWN
       Wednesday
       01/01/2014
       16:21
       TENDERLOIN
       NONE
       (-122.513072534, 37.8070423703)
    
    
      2
       NON-CRIMINAL
                     DEATH REPORT, CAUSE UNKNOWN
        Thursday
       01/02/2014
       02:00
          MISSION
       NONE
       (-122.513072534, 37.8070423703)
    
    
      3
       NON-CRIMINAL
                     DEATH REPORT, CAUSE UNKNOWN
        Thursday
       01/02/2014
       14:30
          BAYVIEW
       NONE
       (-122.513072534, 37.8070423703)
    
    
      4
       NON-CRIMINAL
                                      AIDED CASE
       Wednesday
       01/01/2014
       00:17
         SOUTHERN
       NONE
       (-122.513072534, 37.8070423703)
    
    
      5
          VANDALISM
       MALICIOUS MISCHIEF, VANDALISM OF VEHICLES
       Wednesday
       01/01/2014
       00:30
         SOUTHERN
       NONE
       (-122.513072534, 37.8070423703)



In [110]:

    
dff = df.head()
ct = 0
for row in dff.iterrows():
    x_loc = row[1][9]
    y_loc = row[1][10]
    print x_loc, y_loc
    ct += 1
    if ct > 10:
        break
dff









    



X Y
-122.413794408659 37.7847721323318
-122.438234555172 37.7502026788808
-122.37192472361 37.7278984855968
-122.417565593086 37.7738921424314
-122.393966072273 37.7950278235052
-122.476399135614 37.7805000623007
-122.416293820935 37.7774936777647
-122.393493132081 37.7855854751061
-122.411961101789 37.7518080017934
-122.419225511064 37.7917052018228






    Out[110]:






  
    
      
      IncidntNum
      Category
      Descript
      DayOfWeek
      Date
      Time
      PdDistrict
      Resolution
      Location
      X
      Y
    
  
  
    
      0
       IncidntNum
           Category
                          Descript
       DayOfWeek
             Date
        Time
       PdDistrict
       Resolution
                         Location
                       X
                      Y
    
    
      1
        140001966
       NON-CRIMINAL
       DEATH REPORT, CAUSE UNKNOWN
       Wednesday
       01/01/2014
       16:21
       TENDERLOIN
             NONE
          400.0 Block of ELLIS ST
       -122.413794408659
       37.7847721323318
    
    
      2
        140003025
       NON-CRIMINAL
       DEATH REPORT, CAUSE UNKNOWN
        Thursday
       01/02/2014
       02:00
          MISSION
             NONE
         500.0 Block of JERSEY ST
       -122.438234555172
       37.7502026788808
    
    
      3
        140004487
       NON-CRIMINAL
       DEATH REPORT, CAUSE UNKNOWN
        Thursday
       01/02/2014
       14:30
          BAYVIEW
             NONE
          100.0 Block of CORAL CT
        -122.37192472361
       37.7278984855968
    
    
      4
        140000059
       NON-CRIMINAL
                        AIDED CASE
       Wednesday
       01/01/2014
       00:17
         SOUTHERN
             NONE
       1500.0 Block of MISSION ST
       -122.417565593086
       37.7738921424314



In [ ]:

	Category	Descript	DayOfWeek	Date	Time	PdDistrict	Resolution	XY
1	NON-CRIMINAL	DEATH REPORT, CAUSE UNKNOWN	Wednesday	01/01/2014	16:21	TENDERLOIN	NONE	(-122.513072534, 37.8070423703)
2	NON-CRIMINAL	DEATH REPORT, CAUSE UNKNOWN	Thursday	01/02/2014	02:00	MISSION	NONE	(-122.513072534, 37.8070423703)
3	NON-CRIMINAL	DEATH REPORT, CAUSE UNKNOWN	Thursday	01/02/2014	14:30	BAYVIEW	NONE	(-122.513072534, 37.8070423703)
4	NON-CRIMINAL	AIDED CASE	Wednesday	01/01/2014	00:17	SOUTHERN	NONE	(-122.513072534, 37.8070423703)
5	VANDALISM	MALICIOUS MISCHIEF, VANDALISM OF VEHICLES	Wednesday	01/01/2014	00:30	SOUTHERN	NONE	(-122.513072534, 37.8070423703)

	IncidntNum	Category	Descript	DayOfWeek	Date	Time	PdDistrict	Resolution	Location	X	Y
0	IncidntNum	Category	Descript	DayOfWeek	Date	Time	PdDistrict	Resolution	Location	X	Y
1	140001966	NON-CRIMINAL	DEATH REPORT, CAUSE UNKNOWN	Wednesday	01/01/2014	16:21	TENDERLOIN	NONE	400.0 Block of ELLIS ST	-122.413794408659	37.7847721323318
2	140003025	NON-CRIMINAL	DEATH REPORT, CAUSE UNKNOWN	Thursday	01/02/2014	02:00	MISSION	NONE	500.0 Block of JERSEY ST	-122.438234555172	37.7502026788808
3	140004487	NON-CRIMINAL	DEATH REPORT, CAUSE UNKNOWN	Thursday	01/02/2014	14:30	BAYVIEW	NONE	100.0 Block of CORAL CT	-122.37192472361	37.7278984855968
4	140000059	NON-CRIMINAL	AIDED CASE	Wednesday	01/01/2014	00:17	SOUTHERN	NONE	1500.0 Block of MISSION ST	-122.417565593086	37.7738921424314