This notebook takes in the raw 311 data and the raw 911 data, and creates a dataframe with 3 columns: the (TimeBin, XBin, YBin) tuple, the list of 311 reports that fall in that bin, and the list of 911 reports that fall in that bin.
In [3]:
#Notebook for converting 311 data into the format (tuple, list of 311 reports that occurred at that tuple)
#Request Details is not used and contains NaN values, so we drop that column here.
DATA_PATH = "data/data_311.csv" # path on the Amazon machine
import pandas as pd
data_311 = pd.read_csv(DATA_PATH, na_values=['-'])
data_311 = data_311.where((pd.notnull(data_311)), None) #replace NaN with None (upcasts the affected columns to object dtype)
#data_311 = data_311[['Opened', 'Category', 'Request Type', 'Request Details', 'Supervisor District', 'Point', 'Source']]
data_311 = data_311[['Opened', 'Category', 'Request Type', 'Supervisor District', 'Point']]
data_311.head()
Out[3]:
In [4]:
import numpy as np
#drop the blank rows at the end of the 311 dataset (keep only rows with all 5 columns non-null)
data_311 = data_311.dropna(thresh=5)
data_311.tail(5)
Out[4]:
In [5]:
#convert the time the 311 report was opened to a python datetime
data_311['DateTime'] = pd.to_datetime(data_311['Opened'])
data_311 = data_311.drop('Opened', axis=1)
data_311.head()
Out[5]:
In [6]:
def create_bins(min_pt, max_pt, n):
"""
Creates n equally spaced bin edges between min_pt and max_pt
@params min_pt float min value
@params max_pt float max value
@params n int number of bin edges to create
@return np.array of bin edges
"""
return np.linspace(min_pt, max_pt, n)
import bisect
def get_bin(bins, val):
"""
Determines which bin the input val falls into. Bins are represented
by an increasing np.array of left edges. Val is assigned to the highest
edge whose value is less than or equal to val. (e.g. for bins
[0.0, 0.5, 1.0], 0.25 is assigned to bin 0.0 and 0.75 to bin 0.5)
Note: a val below bins[0] gives index -1 and silently wraps to bins[-1],
so inputs should be filtered to the bin range first.
@params bins np.array of increasing values
@params val float to bin
@return bin (left edge) that val belongs to
"""
index = bisect.bisect_right(bins, val)-1 #bisect_right returns 2 for 0.75; unlike bisect_left, it also keeps exact edge matches (e.g. 0.5) in their own bin
return bins[index]
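A quick sanity check of the two helpers (a sketch with made-up values, not part of the original run):
edges = create_bins(0.0, 1.0, 3) # array([ 0. , 0.5, 1. ])
print get_bin(edges, 0.25) # 0.0
print get_bin(edges, 0.75) # 0.5
print get_bin(edges, 0.5) # 0.5 -- an exact edge match stays in its own bin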
In [7]:
mintime = data_311['DateTime'].min()
maxtime = data_311['DateTime'].max()
print mintime
print maxtime
In [8]:
sec_range = (maxtime-mintime).total_seconds()/5000
#with 5000 time-bin edges, each time bin spans this many hours
print sec_range/3600
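As a rough check (using the min/max times hard-coded further down in this notebook): 2008-07-01 00:13:00 to 2014-11-29 00:57:11 is about 2342 days, roughly 2.02e8 seconds, so 2.02e8 / 5000 gives bins of about 40,470 seconds, i.e. around 11.2 hours each.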
In [9]:
#make time bins for the 311 dataset
#subtracting 2 datetimes creates a timedelta object
bins = create_bins(0, (maxtime-mintime).total_seconds(), 5000)
data_311['TimeBin'] = data_311.apply(lambda row: get_bin(bins, (row['DateTime']-mintime).total_seconds()), axis = 1)
data_311.head()
Out[9]:
In [10]:
print bins
In [11]:
#separate the Point tuple into X and Y columns; the string is "(lat, lon)",
#so index 1 is longitude (X) and index 0 is latitude (Y)
data_311['X'] = data_311.apply(lambda row: float(row['Point'].strip(')(').split(',')[1]), axis=1)
data_311['Y'] = data_311.apply(lambda row: float(row['Point'].strip(')(').split(',')[0]), axis=1)
data_311.head()
Out[11]:
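An equivalent vectorized version, which avoids the per-row apply (a sketch; assumes a pandas recent enough to support str.split(expand=True)):
latlon = data_311['Point'].str.strip('()').str.split(',', expand=True).astype(float)
data_311['Y'] = latlon[0] # latitude
data_311['X'] = latlon[1] # longitude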
In [11]:
# filter out all data that are in the top/bottom 1% of X or Y locations.
# This was done because we noticed there were some outliers in the X and Y data.
#min_x_loc = np.percentile(data_311['X'],1) #without filter it was -122.514436
#max_x_loc = np.percentile(data_311['X'],99) #without filter -119.850760
#min_y_loc = np.percentile(data_311['Y'],1) #without filter 37.624394
#max_y_loc = np.percentile(data_311['Y'],99) #without filter 37.881603
#orig_len = len(data_311)
In [12]:
# find the min and max locations. These values will be hard-coded into format_911.
# hard code these (like we do in the 911 data) to get rid of hidden precision
min_x_loc = -122.505716
max_x_loc = -122.384338
min_y_loc = 37.710652
max_y_loc = 37.803545
print "min x location: %f" % min_x_loc #min x location: -122.505716
print "max x location: %f" % max_x_loc #max x location: -122.384338
print "min y location: %f" % min_y_loc #min y location: 37.710652
print "max y location: %f" % max_y_loc #max y location: 37.803545
In [13]:
#print "original length : " + str(orig_len)
#print "original length * 96% : " + str(orig_len*.96)
data_311 = data_311[data_311['X']>=min_x_loc]
data_311 = data_311[data_311['X']<=max_x_loc]
data_311 = data_311[data_311['Y']>=min_y_loc]
data_311 = data_311[data_311['Y']<=max_y_loc]
print "after length : " + str(len(data_311)) #should go down about 4% (actually .98*.98)
In [14]:
#create 10 equally spaced x and y bin edges over the filtered area
xbins = create_bins(min_x_loc, max_x_loc, 10)
ybins = create_bins(min_y_loc, max_y_loc, 10)
data_311['XBin'] = data_311.apply(lambda row: get_bin(xbins, row['X']), axis = 1)
data_311['YBin'] = data_311.apply(lambda row: get_bin(ybins, row['Y']), axis = 1)
data_311.head()
Out[14]:
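A quick look at how the reports spread across the spatial grid (a sketch, not part of the original run):
print data_311.groupby(['XBin', 'YBin']).size().describe()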
In [15]:
#we now have TimeBin, XBin, and YBin.
#Drop DateTime, Point, X, and Y (the columns we used to generate these bins).
data_311 = data_311[['Category','Request Type', 'Supervisor District', 'TimeBin','XBin','YBin']]
data_311.head()
Out[15]:
In [16]:
#store into csv as a checkpoint
data_311.to_csv('intermediate_3112.csv', index=False) #index=False (not index_label=False) keeps the index out of the file
In [17]:
DATA_PATH = "intermediate_3112.csv" # path of amazon machine
import pandas as pd
data_311 = pd.read_csv(DATA_PATH, na_values=['-'])
data_311.head()
Out[17]:
In [18]:
#aggregate 311 reports by tuple
agg_dict_311 = {}
for index, row in data_311.iterrows():
tupe = (row['TimeBin'], row['XBin'], row['YBin'])
if tupe not in agg_dict_311:
agg_dict_311[tupe] = []
agg_dict_311[tupe].append(row.to_dict())
print len(agg_dict_311)
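The same aggregation can be written with groupby, which is usually much faster than iterrows (a sketch, assuming this pandas version supports to_dict('records')):
grouped = data_311.groupby(['TimeBin', 'XBin', 'YBin'])
feature_311 = grouped.apply(lambda g: g.to_dict('records')) # Series: tuple -> list of report dicts
print len(feature_311)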
In [19]:
agg_dict_311_2 = {}
agg_dict_311_2['feature-311'] = []
agg_dict_311_2['tuple'] = []
for key in agg_dict_311:
val = agg_dict_311[key]
agg_dict_311_2['tuple'].append(key)
agg_dict_311_2['feature-311'].append(val)
print "done"
In [20]:
df_311_final = pd.DataFrame(agg_dict_311_2)
df_311_final.head(5)
Out[20]:
In [21]:
# store a temporary result in case of crash
df_311_final.to_csv('df_311_ready_to_join_tmp.csv', index=False)
In [22]:
# now format the 911 data
DATA_PATH = "data/data_911_with_dateTime.csv" # Make this the /path/to/the/data
import pandas as pd
data_911 = pd.read_csv(DATA_PATH, na_values=['-'])
data_911 = data_911.where((pd.notnull(data_911)), None)
data_911.head(1)
Out[22]:
In [23]:
import numpy as np
#convert to datetime
data_911['DateTime'] = pd.to_datetime(data_911['DateTime'])
type(data_911.head()['DateTime'][0])
Out[23]:
In [24]:
#temp. don't cut off extremes
#min_x_loc = -122.514436
#max_x_loc = -119.850760
#min_y_loc = 37.624394
#max_y_loc = 37.881603
#print "done"
min_x_loc = -122.505716
max_x_loc = -122.384338
min_y_loc = 37.710652
max_y_loc = 37.803545
print "min x location: %f" % min_x_loc #min x location: -122.505716
print "max x location: %f" % max_x_loc #max x location: -122.384338
print "min y location: %f" % min_y_loc #min y location: 37.710652
print "max y location: %f" % max_y_loc #max y location: 37.803545
In [25]:
#make time bins for the 911 dataset
#hard-code the 311 dataset's min/max times so both datasets share the same 5000 time bins
mintime = pd.to_datetime('2008-07-01 00:13:00')
maxtime = pd.to_datetime('2014-11-29 00:57:11')
print maxtime-mintime
#subtracting 2 datetimes creates a timedelta object
bins = create_bins(0, (maxtime-mintime).total_seconds(), 5000)
data_911['TimeBin'] = data_911.apply(lambda row: get_bin(bins, (row['DateTime']-mintime).total_seconds()), axis = 1)
data_911.head()
Out[25]:
In [26]:
xbins = create_bins(min_x_loc, max_x_loc, 10)
ybins = create_bins(min_y_loc, max_y_loc, 10)
data_911['XBin'] = data_911.apply(lambda row: get_bin(xbins, row['X']), axis = 1)
data_911['YBin'] = data_911.apply(lambda row: get_bin(ybins, row['Y']), axis = 1)
data_911 = data_911[['Category', 'Descript', 'PdDistrict', 'Resolution', 'TimeBin', 'XBin', 'YBin']]
In [27]:
xbins #should be array [-122.505716 ... -122.384338]
Out[27]:
In [28]:
#store into csv as a checkpoint
data_911.to_csv('intermediate_9112.csv', index=False)
In [29]:
#DATA_PATH = "/home/datascience/project/intermediate_911.csv" # Make this the /path/to/the/data
DATA_PATH = "intermediate_9112.csv" # path on the Amazon machine (the checkpoint written above)
import pandas as pd
data_911 = pd.read_csv(DATA_PATH, na_values=['-'])
data_911 = data_911.where((pd.notnull(data_911)), None)
data_911.head()
Out[29]:
In [30]:
agg_dict = {} #maps (Time, X, Y) tuples to list of 911-reports
for index, row in data_911.iterrows():
tupe = (row['TimeBin'], row['XBin'], row['YBin'])
if tupe not in agg_dict:
agg_dict[tupe] = []
agg_dict[tupe].append(row.to_dict())
agg_dict2 = {} # dictionary to create dataframe with
agg_dict2['911-reports'] = []
agg_dict2['tuple'] = []
for k in agg_dict:
agg_dict2['911-reports'].append(agg_dict[k])
agg_dict2['tuple'].append(k)
In [31]:
(agg_dict2['911-reports'])[0:5]
Out[31]:
In [40]:
df_911_final = pd.DataFrame(agg_dict2)
print df_911_final.head(2)
print len(df_911_final.iloc[0][0])
In [41]:
#store into csv before joins, in case we want to analyze these dataframes.
df_311_final.to_csv('df_311_final.csv', index=False)
df_911_final.to_csv('df_911_final.csv', index=False)
In [33]:
import matplotlib.pyplot as plt
#dir(plt)
print (df_911_final['911-reports'].apply(len)).describe()
print (df_311_final['feature-311'].apply(len)).describe()
In [45]:
print df_311_final.head(2)
print len(df_311_final)
print df_911_final.head(2)
print len(df_911_final)
In [46]:
# join the dataframes.
# left-outer join on the tuples, so every 311 bin is kept
joined_df = df_311_final.merge(df_911_final, how='left', on='tuple')
# bins with no matching 911 reports come back as NaN; turn those into empty lists
joined_df = joined_df.where((pd.notnull(joined_df)), 0)
joined_df['911-reports'] = joined_df['911-reports'].apply(lambda x: [] if x==0 else x)
print "done joining"
In [47]:
joined_df.info()
In [48]:
joined_df.head(15)
Out[48]:
In [37]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
counts_911_series = joined_df['911-reports'].apply(lambda x: 0 if not x else len(x))
print counts_911_series.describe()
counts_911_series.value_counts()
Out[37]:
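To see the distribution rather than just summarize it, a histogram is one option (a sketch using the pyplot import above):
counts_911_series.hist(bins=50)
plt.xlabel('911 reports per (TimeBin, XBin, YBin) bin')
plt.ylabel('number of bins')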
In [49]:
#store the joined dataframe into csv.
joined_df.to_csv('joined_unfeaturized.csv', index=False)
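One caveat with these CSV checkpoints (an assumption to verify, not something this notebook does): the tuple and list-of-dict columns are serialized as their string repr, so reading the file back requires re-parsing them, e.g.:
import ast
df = pd.read_csv('joined_unfeaturized.csv')
df['tuple'] = df['tuple'].apply(ast.literal_eval)
df['911-reports'] = df['911-reports'].apply(ast.literal_eval) # note: fails if any value was written as nan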