In [1]:
DATA_PATH = "data/data_911_combined.csv" # Make this the /path/to/the/data
import numpy as np
import pandas as pd
data_911 = pd.read_csv(DATA_PATH, na_values=['-'])
data_911 = data_911.where((pd.notnull(data_911)), None)
data_911.head()
data_911.info()
type(data_911.head()['DateTime'][0])


<class 'pandas.core.frame.DataFrame'>
Int64Index: 746596 entries, 0 to 746595
Data columns (total 9 columns):
Category      746596 non-null object
Descript      746596 non-null object
DayOfWeek     746596 non-null object
DateTime      746596 non-null object
PdDistrict    746596 non-null object
Resolution    746596 non-null object
Location      746596 non-null object
X             746596 non-null float64
Y             746596 non-null float64
dtypes: float64(2), object(7)
Out[1]:
str

In [2]:
#convert to datetime
data_911['DateTime'] = pd.to_datetime(data_911['DateTime'])
type(data_911.head()['DateTime'][0])


Out[2]:
pandas.tslib.Timestamp

In [3]:
def create_bins(min_pt, max_pt, n):
    """
    Creates n equally spaced bins between min_pt and max_pt
    
    @params min_pt float min value
    @params max_pt float max value
    @params n number of bins to create
    @return np.array bin values
    """
    return np.linspace(min_pt, max_pt, n)

import bisect
def get_bin(bins, val):
    """
    Determines which bin the input val falls into. Bins are represented 
    by an increasing np.array. Val is assigned to the highest bin whose 
    value is less than val. (e.g. for bins [0.0, 0.5, 1.0], 0.25 would 
    be assigned to bin 0.0, 0.75 would be assigned to 0.5)
    
    @params bins np.array of increasing values
    @params val float to bin
    @return bin that val belongs to
    """
    index = bisect.bisect_right(bins, val)-1 #bisect_left returns 2 with [0.0, 0.5, 1.0] and 0.75 as input
    return bins[index]

In [4]:
#temp.  don't cut off extremes
#min_x_loc = -122.514436
#max_x_loc = -119.850760
#min_y_loc = 37.624394
#max_y_loc = 37.881603
#print "done"
min_x_loc = -122.505716
max_x_loc = -122.384338
min_y_loc = 37.710652
max_y_loc = 37.803545
print "min x location: %f" % min_x_loc  #min x location: -122.505716
print "max x location: %f" % max_x_loc  #max x location: -122.384338
print "min y location: %f" % min_y_loc  #min y location: 37.710652
print "max y location: %f" % max_y_loc  #max y location: 37.803545


min x location: -122.505716
max x location: -122.384338
min y location: 37.710652
max y location: 37.803545

In [5]:
#make time bin for 911 dataset
mintime = pd.to_datetime('2008-07-01 00:13:00')
maxtime = pd.to_datetime('2014-11-29 00:57:11')
print maxtime-mintime
#subtracting 2 datetimes creates a tdelta object
bins = create_bins(0, (maxtime-mintime).total_seconds(), 5000)
data_911['TimeBin'] = data_911.apply(lambda row: get_bin(bins, (row['DateTime']-mintime).total_seconds()), axis = 1)
data_911.head()


2342 days, 0:44:11
Out[5]:
Category Descript DayOfWeek DateTime PdDistrict Resolution Location X Y TimeBin
0 ASSAULT BATTERY Thursday 2009-01-01 00:58:00 CENTRAL NONE 200.0 Block of CLAY ST -122.398584 37.795269 15867527.263853
1 SEX OFFENSES, FORCIBLE ASSAULT TO RAPE WITH BODILY FORCE Thursday 2009-01-01 01:00:00 SOUTHERN NONE 500.0 Block of HOWARD ST -122.396540 37.788093 15867527.263853
2 ROBBERY ROBBERY, ARMED WITH A GUN Thursday 2009-01-01 01:00:00 SOUTHERN NONE 500.0 Block of HOWARD ST -122.396540 37.788093 15867527.263853
3 BURGLARY BURGLARY OF APARTMENT HOUSE, ATT FORCIBLE ENTRY Thursday 2009-01-01 01:00:00 INGLESIDE NONE 200.0 Block of BLYTHDALE AV -122.421717 37.711308 15867527.263853
4 LARCENY/THEFT GRAND THEFT FROM UNLOCKED AUTO Thursday 2009-01-01 01:00:00 MISSION NONE 3700.0 Block of 21ST ST -122.430888 37.756204 15867527.263853

In [6]:
#max_y_loc = 37.803545
#min_y_loc = 37.710652
#print "min y location: %f" % min_y_loc
#print "max y location: %f" % max_y_loc
#max_x_loc = -122.384338
#min_x_loc = -122.505716
#print "min x location: %f" % min_x_loc
#print "max x location: %f" % max_x_loc

In [7]:
xbins = create_bins(min_x_loc, max_x_loc, 10)
ybins = create_bins(min_y_loc, max_y_loc, 10)

data_911['XBin'] = data_911.apply(lambda row: get_bin(xbins, row['X']), axis = 1)
data_911['YBin'] = data_911.apply(lambda row: get_bin(ybins, row['Y']), axis = 1)
data_911 = data_911[['Category', 'Descript', 'PdDistrict', 'Resolution', 'TimeBin', 'XBin', 'YBin']]

In [8]:
xbins


Out[8]:
array([-122.505716  , -122.49222956, -122.47874311, -122.46525667,
       -122.45177022, -122.43828378, -122.42479733, -122.41131089,
       -122.39782444, -122.384338  ])

In [9]:
ybins


Out[9]:
array([ 37.710652  ,  37.72097344,  37.73129489,  37.74161633,
        37.75193778,  37.76225922,  37.77258067,  37.78290211,
        37.79322356,  37.803545  ])

In [10]:
#store into csv as checkpoint

data_911.to_csv('intermediate_911.csv', index_label=False)

In [11]:
#DATA_PATH = "/home/datascience/project/intermediate_911.csv" # Make this the /path/to/the/data

DATA_PATH = "intermediate_911.csv" # path of amazon machine

import pandas as pd

data_911 = pd.read_csv(DATA_PATH, na_values=['-'])
data_911 = data_911.where((pd.notnull(data_911)), None)
data_911.head()


Out[11]:
Category Descript PdDistrict Resolution TimeBin XBin YBin
0 ASSAULT BATTERY CENTRAL NONE 15867527.263853 -122.411311 37.793224
1 SEX OFFENSES, FORCIBLE ASSAULT TO RAPE WITH BODILY FORCE SOUTHERN NONE 15867527.263853 -122.397824 37.782902
2 ROBBERY ROBBERY, ARMED WITH A GUN SOUTHERN NONE 15867527.263853 -122.397824 37.782902
3 BURGLARY BURGLARY OF APARTMENT HOUSE, ATT FORCIBLE ENTRY INGLESIDE NONE 15867527.263853 -122.424797 37.710652
4 LARCENY/THEFT GRAND THEFT FROM UNLOCKED AUTO MISSION NONE 15867527.263853 -122.438284 37.751938

In [11]:


In [12]:
agg_dict = {}
for index, row in data_911.iterrows():
    tupe = (row['TimeBin'], row['XBin'], row['YBin'])
    if tupe not in agg_dict:
        agg_dict[tupe] = []
    agg_dict[tupe].append(row.to_dict())

In [13]:
agg_dict2 = {}
agg_dict2['911-reports'] = []
agg_dict2['tuple'] = []
for k in agg_dict:
    agg_dict2['911-reports'].append(agg_dict[k])
    agg_dict2['tuple'].append(k)

In [14]:
(agg_dict2['911-reports'])[0:5]


Out[14]:
[[{'Category': 'WARRANTS',
   'Descript': 'ENROUTE TO OUTSIDE JURISDICTION',
   'PdDistrict': 'PARK',
   'Resolution': 'ARREST, BOOKED',
   'TimeBin': 68448950.51830366,
   'XBin': -122.45177022222221,
   'YBin': 37.76225922222223},
  {'Category': 'WARRANTS',
   'Descript': 'ENROUTE TO OUTSIDE JURISDICTION',
   'PdDistrict': 'PARK',
   'Resolution': 'NONE',
   'TimeBin': 68448950.51830366,
   'XBin': -122.45177022222221,
   'YBin': 37.76225922222223},
  {'Category': 'FRAUD',
   'Descript': 'CREDIT CARD, THEFT BY USE OF',
   'PdDistrict': 'PARK',
   'Resolution': 'NONE',
   'TimeBin': 68448950.51830366,
   'XBin': -122.45177022222221,
   'YBin': 37.76225922222223},
  {'Category': 'LARCENY/THEFT',
   'Descript': 'GRAND THEFT OF PROPERTY',
   'PdDistrict': 'PARK',
   'Resolution': 'NONE',
   'TimeBin': 68448950.51830366,
   'XBin': -122.45177022222221,
   'YBin': 37.76225922222223},
  {'Category': 'ASSAULT',
   'Descript': 'THREATENING PHONE CALL(S)',
   'PdDistrict': 'PARK',
   'Resolution': 'NONE',
   'TimeBin': 68448950.51830366,
   'XBin': -122.45177022222221,
   'YBin': 37.76225922222223}],
 [{'Category': 'MISSING PERSON',
   'Descript': 'MISSING JUVENILE',
   'PdDistrict': 'TARAVAL',
   'Resolution': 'NONE',
   'TimeBin': 89821538.2614523,
   'XBin': -122.47874311111113,
   'YBin': 37.710652}],
 [{'Category': 'LARCENY/THEFT',
   'Descript': 'PETTY THEFT FROM LOCKED AUTO',
   'PdDistrict': 'TARAVAL',
   'Resolution': 'NONE',
   'TimeBin': 198060742.09701943,
   'XBin': -122.50571599999999,
   'YBin': 37.72097344444445},
  {'Category': 'FRAUD',
   'Descript': 'CREDIT CARD, THEFT BY USE OF',
   'PdDistrict': 'TARAVAL',
   'Resolution': 'NONE',
   'TimeBin': 198060742.09701943,
   'XBin': -122.50571599999999,
   'YBin': 37.72097344444445}],
 [{'Category': 'DRUG/NARCOTIC',
   'Descript': 'POSSESSION OF HALLUCINOGENIC',
   'PdDistrict': 'MISSION',
   'Resolution': 'ARREST, BOOKED',
   'TimeBin': 166811428.19983998,
   'XBin': -122.42479733333332,
   'YBin': 37.76225922222223},
  {'Category': 'OTHER OFFENSES',
   'Descript': 'OPEN CONTAINER OF ALCOHOL IN VEHICLE',
   'PdDistrict': 'MISSION',
   'Resolution': 'ARREST, BOOKED',
   'TimeBin': 166811428.19983998,
   'XBin': -122.42479733333332,
   'YBin': 37.76225922222223},
  {'Category': 'NON-CRIMINAL',
   'Descript': 'FOUND PROPERTY',
   'PdDistrict': 'MISSION',
   'Resolution': 'NONE',
   'TimeBin': 166811428.19983998,
   'XBin': -122.42479733333332,
   'YBin': 37.76225922222223},
  {'Category': 'LARCENY/THEFT',
   'Descript': 'PETTY THEFT FROM LOCKED AUTO',
   'PdDistrict': 'MISSION',
   'Resolution': 'NONE',
   'TimeBin': 166811428.19983998,
   'XBin': -122.42479733333332,
   'YBin': 37.76225922222223},
  {'Category': 'NON-CRIMINAL',
   'Descript': 'FOUND PROPERTY',
   'PdDistrict': 'MISSION',
   'Resolution': 'NONE',
   'TimeBin': 166811428.19983998,
   'XBin': -122.42479733333332,
   'YBin': 37.76225922222223},
  {'Category': 'OTHER OFFENSES',
   'Descript': 'MISCELLANEOUS INVESTIGATION',
   'PdDistrict': 'SOUTHERN',
   'Resolution': 'NONE',
   'TimeBin': 166811428.19983998,
   'XBin': -122.42479733333332,
   'YBin': 37.76225922222223},
  {'Category': 'LARCENY/THEFT',
   'Descript': 'GRAND THEFT FROM LOCKED AUTO',
   'PdDistrict': 'SOUTHERN',
   'Resolution': 'NONE',
   'TimeBin': 166811428.19983998,
   'XBin': -122.42479733333332,
   'YBin': 37.76225922222223},
  {'Category': 'LARCENY/THEFT',
   'Descript': 'GRAND THEFT FROM UNLOCKED AUTO',
   'PdDistrict': 'SOUTHERN',
   'Resolution': 'NONE',
   'TimeBin': 166811428.19983998,
   'XBin': -122.42479733333332,
   'YBin': 37.76225922222223}],
 [{'Category': 'LARCENY/THEFT',
   'Descript': 'GRAND THEFT FROM LOCKED AUTO',
   'PdDistrict': 'TARAVAL',
   'Resolution': 'NONE',
   'TimeBin': 125240125.90398079,
   'XBin': -122.49222955555555,
   'YBin': 37.710652}]]

In [15]:
def generate_feature_vector(reports_911):
    # simple feature vector that is just the count of 911 reports associated with a particular tuple
    feature_vector = len(reports_911)
    return feature_vector

In [16]:
agg_dict3 = {}
agg_dict3['tuple'] = []
agg_dict3['feature-911'] = []
for key in agg_dict:
    feature_vector = generate_feature_vector(agg_dict[key])
    agg_dict3['feature-911'].append(feature_vector)
    agg_dict3['tuple'].append(key)

In [17]:
(agg_dict3['feature-911'])[0:5]


Out[17]:
[5, 1, 2, 8, 1]

In [18]:
final_df = pd.DataFrame(agg_dict3)
final_df.head(2)
#final_df.to_csv("data_911_ready_to_join.csv", index_label=False)


Out[18]:
feature-911 tuple
0 5 (68448950.5183, -122.451770222, 37.7622592222)
1 1 (89821538.2615, -122.478743111, 37.710652)

In [19]:
len(final_df)


Out[19]:
216286

In [20]:
final_df.head(5)


Out[20]:
feature-911 tuple
0 5 (68448950.5183, -122.451770222, 37.7622592222)
1 1 (89821538.2615, -122.478743111, 37.710652)
2 2 (198060742.097, -122.505716, 37.7209734444)
3 8 (166811428.2, -122.424797333, 37.7622592222)
4 1 (125240125.904, -122.492229556, 37.710652)

In [21]:
final_df.to_csv("data_911_ready_to_join.csv", index_label=False)

In [22]:
t0 = [x[0] for x in final_df['tuple']]
t0s = set(t0)
print len(t0s)

t1 = [x[1] for x in final_df['tuple']]
t1s = set(t1)
print len(t1s)

t2 = [x[2] for x in final_df['tuple']]
t2s = set(t2)
print len(t2s)


4581
10
10

In [25]:
t2s


Out[25]:
{37.710652,
 37.72097344444445,
 37.73129488888889,
 37.74161633333333,
 37.75193777777777,
 37.76225922222223,
 37.77258066666667,
 37.78290211111111,
 37.793223555555564,
 37.803545}

In [23]:
data_311 = pd.read_csv('df_311_ready_to_join.csv', na_values=['-'])
data_311 = data_311.where((pd.notnull(data_311)), None)
data_311.head(15)


Out[23]:
feature-311 tuple
0 [{'Category': 'Sign Repair', 'TimeBin': 627414... (6274149.810962193, -122.42479733333332, 37.77...
1 [{'Category': '311 External Request', 'TimeBin... (68448950.51830366, -122.45177022222221, 37.76...
2 [{'Category': 'Litter Receptacles', 'TimeBin':... (180695514.55571115, -122.50571599999999, 37.7...
3 [{'Category': 'Rec and Park Requests', 'TimeBi... (198060742.09701943, -122.50571599999999, 37.7...
4 [{'Category': 'General Requests', 'TimeBin': 1... (166811428.19983998, -122.42479733333332, 37.7...
5 [{'Category': 'General Requests', 'TimeBin': 1... (125240125.90398079, -122.49222955555555, 37.7...
6 [{'Category': 'Street and Sidewalk Cleaning', ... (164827987.29185838, -122.41131088888888, 37.7...
7 [{'Category': 'Street and Sidewalk Cleaning', ... (101115007.92118424, -122.42479733333332, 37.7...
8 [{'Category': 'Tree Maintenance', 'TimeBin': 8... (80592466.28145629, -122.43828377777778, 37.78...
9 [{'Category': 'General Requests', 'TimeBin': 1... (168268650.0914183, -122.43828377777778, 37.77...
10 [{'Category': 'Sidewalk or Curb', 'TimeBin': 1... (16110397.579115825, -122.43828377777778, 37.7...
11 [{'Category': 'Graffiti Public Property', 'Tim... (194822471.22684535, -122.42479733333332, 37.7...
12 [{'Category': 'Damaged Property', 'TimeBin': 1... (166163774.02580518, -122.41131088888888, 37.7...
13 [{'Category': 'Tree Maintenance', 'TimeBin': 1... (135157330.44388878, -122.46525666666668, 37.7...
14 [{'Category': 'Damaged Property', 'TimeBin': 1... (115444356.52170436, -122.39782444444445, 37.7...

In [23]: