In [3]:
DATA_PATH = "data/data_911_with_dateTime.csv" # Make this the /path/to/the/data
import pandas as pd
data_911 = pd.read_csv(DATA_PATH, na_values=['-'])
data_911 = data_911.where((pd.notnull(data_911)), None)
data_911.head()


Out[3]:
Category Descript DayOfWeek DateTime PdDistrict Resolution Location X Y TimeBin
0 ASSAULT BATTERY Thursday 2009-01-01 00:58:00 CENTRAL NONE 200.0 Block of CLAY ST -122.398584 37.795269 0
1 SEX OFFENSES, FORCIBLE ASSAULT TO RAPE WITH BODILY FORCE Thursday 2009-01-01 01:00:00 SOUTHERN NONE 500.0 Block of HOWARD ST -122.396540 37.788093 0
2 ROBBERY ROBBERY, ARMED WITH A GUN Thursday 2009-01-01 01:00:00 SOUTHERN NONE 500.0 Block of HOWARD ST -122.396540 37.788093 0
3 BURGLARY BURGLARY OF APARTMENT HOUSE, ATT FORCIBLE ENTRY Thursday 2009-01-01 01:00:00 INGLESIDE NONE 200.0 Block of BLYTHDALE AV -122.421717 37.711308 0
4 LARCENY/THEFT GRAND THEFT FROM UNLOCKED AUTO Thursday 2009-01-01 01:00:00 MISSION NONE 3700.0 Block of 21ST ST -122.430888 37.756204 0

In [26]:
import numpy as np

In [27]:
#convert to datetime
data_911['DateTime'] = pd.to_datetime(data_911['DateTime'])
type(data_911.head()['DateTime'][0])


Out[27]:
pandas.tslib.Timestamp

In [28]:
#just some code to test something

mintime = data_911['DateTime'].min()
maxtime = data_911['DateTime'].max()

#np.linspace(0, (maxtime-mintime).total_seconds(), 100)

In [29]:
data_path = DATA_PATH

def get_loc_time_counts(csv_path, time_interval, location_interval, x_interval, y_interval, n_bins):
    """
    Gets the counts for each category of time divided by pdDistrict and time.
    
    @params csv_path string to csv_file
    @params time string granularity of time split
        "d" : split by day/month/year
        "m" : split by month/year
        "y" : split by year
    @params location_interval float size of each inteval in location grid
    @params min_x, min_y x,y coordinate of where to start grid
        Note: this argument is required because the max/min of 311 may not 
            be the same as 911
    @return pd.Dataframe [datetime, location, category, count]
    """
    data = pd.read_csv(csv_path, low_memory=False)
    
    # map each point to a location interval
    x_min = x_interval[0]
    x_max = x_interval[1]
    x_bins = create_bins(min_x, max_x, n_bins)
    
    y_min = y_interval[0]
    y_max = y_interval[1]
    y_bins = create_bins(min_y, max_y, n_bins)
    
    xy_bins_col = data.apply(lambda row: get_xy(row['Point']))
    
    
    # map each point to a datetime interval
    
    
    for district_name, district_data in data.groupby("PdDistrict"):
        
        for date_time, date_time_data in district_data.groupby("DateTime"):
            crime_counts = date_time_data.groupby("Category").size()

            
def get_xy(point):
    """
    Gets the x_point from an input string point.
    
    @params point string '(x,y)' coordinate 
    @return (x,y) float
    """ 
    x_str, y_str = point.split(',')
    return float(x_str[1:]), float(y_str[:-1])


def create_bins(min_pt, max_pt, n):
    """
    Creates n equally spaced bins between min_pt and max_pt
    
    @params min_pt float min value
    @params max_pt float max value
    @params n number of bins to create
    @return np.array bin values
    """
    return np.linspace(min_pt, max_pt, n)

def get_bin(bins, val):
    """
    Determines which bin the input val falls into. Bins are represented 
    by an increasing np.array. Val is assigned to the highest bin whose 
    value is less than val. (e.g. for bins [0.0, 0.5, 1.0], 0.25 would 
    be assigned to bin 0.0, 0.75 would be assigned to 0.5)
    
    @params bins np.array of increasing values
    @params val float to bin
    @return bin that val belongs to
    """
    for bin in bins[::-1]: # iterate through bins in reverse order
        if val >= bin:
            return bin

In [30]:
#testing something on a mini data set
minidata = data_911.head()
#minidata
#data_911

In [4]:
mintime = data_911['DateTime'].min()
maxtime = data_911['DateTime'].max()
print 'Maxtime:' + str(maxtime)
print "Mintime" + str(mintime)


2008-07-01 00:13:00
2014-11-29 00:57:11


Maxtime:2014-11-16 22:00:00
Mintime2009-01-01 00:01:00

In [5]:
pd.to_datetime('2014-11-29 00:57:11')


Out[5]:
Timestamp('2014-11-29 00:57:11')

In [31]:
#make time bin for 911 dataset

mintime = pd.to_datetime('2008-07-01 00:13:00')
maxtime = pd.to_datetime('2014-11-29 00:57:11')
print maxtime-mintime
#subtracting 2 datetimes creates a tdelta object
bins = create_bins(0, (maxtime-mintime).total_seconds(), 1000)

data_911['TimeBin'] = data_911.apply(lambda row: get_bin(bins, (row['DateTime']-mintime).total_seconds()), axis = 1)

data_911.head()


2145 days, 21:59:00
Out[31]:
Category Descript DayOfWeek DateTime PdDistrict Resolution Location X Y TimeBin
0 ASSAULT BATTERY Thursday 2009-01-01 00:58:00 CENTRAL NONE 200.0 Block of CLAY ST -122.398584 37.795269 0
1 SEX OFFENSES, FORCIBLE ASSAULT TO RAPE WITH BODILY FORCE Thursday 2009-01-01 01:00:00 SOUTHERN NONE 500.0 Block of HOWARD ST -122.396540 37.788093 0
2 ROBBERY ROBBERY, ARMED WITH A GUN Thursday 2009-01-01 01:00:00 SOUTHERN NONE 500.0 Block of HOWARD ST -122.396540 37.788093 0
3 BURGLARY BURGLARY OF APARTMENT HOUSE, ATT FORCIBLE ENTRY Thursday 2009-01-01 01:00:00 INGLESIDE NONE 200.0 Block of BLYTHDALE AV -122.421717 37.711308 0
4 LARCENY/THEFT GRAND THEFT FROM UNLOCKED AUTO Thursday 2009-01-01 01:00:00 MISSION NONE 3700.0 Block of 21ST ST -122.430888 37.756204 0

In [32]:
data_911.tail()


Out[32]:
Category Descript DayOfWeek DateTime PdDistrict Resolution Location X Y TimeBin
746591 LARCENY/THEFT GRAND THEFT FROM LOCKED AUTO Saturday 2014-11-15 18:45:00 SOUTHERN NONE 0.0 Block of 12TH ST -122.418650 37.773030 1.852215e+08
746592 NON-CRIMINAL LOST PROPERTY Friday 2014-11-14 19:12:00 SOUTHERN NONE MARKET ST / 3RD ST -122.403434 37.787643 1.852215e+08
746593 NON-CRIMINAL LOST PROPERTY Friday 2014-11-14 23:10:00 SOUTHERN NONE 700.0 Block of HARRISON ST -122.397715 37.782250 1.852215e+08
746594 OTHER OFFENSES TRAFFIC VIOLATION Tuesday 2014-11-04 15:18:00 TARAVAL ARREST, BOOKED HOLLOWAY AV / VARELA AV -122.476069 37.720779 1.842936e+08
746595 DRUNKENNESS UNDER INFLUENCE OF ALCOHOL IN A PUBLIC PLACE Wednesday 2014-11-05 20:01:00 TARAVAL NONE 1600.0 Block of HOLLOWAY AV -122.477040 37.720921 1.842936e+08

In [32]:


In [33]:
max_x_loc = minidata['X'].max()
min_x_loc = minidata['X'].min()
print "min X location: %f" % min_x_loc
print "max X location: %f" % max_x_loc

max_y_loc = minidata['Y'].max()
min_y_loc = minidata['Y'].min()
print "min Y location: %f" % min_y_loc
print "max Y location: %f" % max_y_loc


min X location: -122.430888
max X location: -122.396540
min Y location: 37.711308
max Y location: 37.795269

In [34]:
max_x_loc = 37.881603
min_x_loc = 37.624394
print "min X location: %f" % min_x_loc
print "max X location: %f" % max_x_loc

max_y_loc = -119.850760
min_y_loc = -122.514436
print "min Y location: %f" % min_y_loc
print "max Y location: %f" % max_y_loc


min X location: -122.513518
max X location: -122.365246
min Y location: 37.708016
max Y location: 37.818226

In [35]:
xbins = create_bins(min_x_loc, max_x_loc, 10)
ybins = create_bins(min_y_loc, max_y_loc, 10)

data_911['XBin'] = data_911.apply(lambda row: get_bin(xbins, row['X']), axis = 1)
data_911['YBin'] = data_911.apply(lambda row: get_bin(ybins, row['Y']), axis = 1)
data_911 = data_911[['Category', 'Descript', 'PdDistrict', 'Resolution', 'TimeBin', 'XBin', 'YBin']]

In [36]:
#store into csv as checkpoint

data_911.to_csv('intermediate_911.csv', index_label=False)

In [37]:
#DATA_PATH = "/home/datascience/project/intermediate_911.csv" # Make this the /path/to/the/data

DATA_PATH = "/home/casual1water/cs194-16-data_manatees/intermediate_911.csv" # path of amazon machine

import pandas as pd

data_911 = pd.read_csv(DATA_PATH, na_values=['-'])
data_911 = data_911.where((pd.notnull(data_911)), None)
data_911.head()


Out[37]:
Category Descript PdDistrict Resolution TimeBin XBin YBin
0 ASSAULT BATTERY CENTRAL NONE 0 -122.414670 37.793735
1 SEX OFFENSES, FORCIBLE ASSAULT TO RAPE WITH BODILY FORCE SOUTHERN NONE 0 -122.398196 37.781490
2 ROBBERY ROBBERY, ARMED WITH A GUN SOUTHERN NONE 0 -122.398196 37.781490
3 BURGLARY BURGLARY OF APARTMENT HOUSE, ATT FORCIBLE ENTRY INGLESIDE NONE 0 -122.431145 37.708016
4 LARCENY/THEFT GRAND THEFT FROM UNLOCKED AUTO MISSION NONE 0 -122.431145 37.744753

In [38]:
agg_dict = {}
for index, row in data_911.iterrows():
    tupe = (row['TimeBin'], row['XBin'], row['YBin'])
    if tupe not in agg_dict:
        agg_dict[tupe] = []
    agg_dict[tupe].append(row.to_dict())

In [39]:
agg_dict2 = {}
agg_dict2['911-reports'] = []
agg_dict2['tuple'] = []
for k in agg_dict:
    agg_dict2['911-reports'].append(agg_dict[k])
    agg_dict2['tuple'].append(k)

In [40]:
df_mini = pd.DataFrame(agg_dict_mini2)
df_mini.head(1)['911-reports']
print (df_mini['911-reports'][0][0])


---------------------------------------------------------------------------
NameError                                 Traceback (most recent call last)
<ipython-input-40-72472e911c7a> in <module>()
----> 1 df_mini = pd.DataFrame(agg_dict_mini2)
      2 df_mini.head(1)['911-reports']
      3 print (df_mini['911-reports'][0][0])

NameError: name 'agg_dict_mini2' is not defined

In [41]:
final_df = pd.DataFrame(agg_dict2)
final_df.to_csv("data_911_ready_to_join.csv", index_label=False)

In [ ]: