In [3]:
DATA_PATH = "data/data_911_with_dateTime.csv" # Make this the /path/to/the/data
import pandas as pd
data_911 = pd.read_csv(DATA_PATH, na_values=['-'])
data_911 = data_911.where((pd.notnull(data_911)), None)
data_911.head()
Out[3]:
In [26]:
import numpy as np
In [27]:
# convert the DateTime column from string to pandas Timestamp
data_911['DateTime'] = pd.to_datetime(data_911['DateTime'])
type(data_911.head()['DateTime'][0])
Out[27]:
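If some of the raw timestamps were malformed, pd.to_datetime could be told to coerce them instead of raising. A minimal sketch, assuming the timestamps follow the "YYYY-MM-DD HH:MM:SS" layout seen later in this notebook (errors='coerce' requires a reasonably recent pandas):

# hypothetical stricter parse; unparseable strings become NaT instead of raising
data_911['DateTime'] = pd.to_datetime(data_911['DateTime'],
                                      format='%Y-%m-%d %H:%M:%S',
                                      errors='coerce')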
In [28]:
# scratch cell: check the overall time range of the dataset
mintime = data_911['DateTime'].min()
maxtime = data_911['DateTime'].max()
#np.linspace(0, (maxtime-mintime).total_seconds(), 100)
In [29]:
def get_loc_time_counts(csv_path, time_interval, x_interval, y_interval, n_bins):
    """
    Gets the counts for each crime category, divided by PdDistrict and time.
    @params csv_path string path to the csv file
    @params time_interval string granularity of the time split
            "d" : split by day/month/year
            "m" : split by month/year
            "y" : split by year
    @params x_interval (min_x, max_x) bounds of the location grid
    @params y_interval (min_y, max_y) bounds of the location grid
            Note: these bounds are passed in explicitly because the max/min
            of the 311 data may not be the same as 911
    @params n_bins number of bins along each axis
    @return pd.DataFrame [datetime, location, category, count]
    """
    data = pd.read_csv(csv_path, low_memory=False)
    # map each point to a location interval
    x_min, x_max = x_interval
    x_bins = create_bins(x_min, x_max, n_bins)
    y_min, y_max = y_interval
    y_bins = create_bins(y_min, y_max, n_bins)
    xy_bins_col = data.apply(lambda row: get_xy(row['Point']), axis=1)
    # map each point to a datetime interval and count crimes per category
    # (still incomplete: the counts are computed but not yet collected into a DataFrame)
    for district_name, district_data in data.groupby("PdDistrict"):
        for date_time, date_time_data in district_data.groupby("DateTime"):
            crime_counts = date_time_data.groupby("Category").size()

def get_xy(point):
    """
    Parses the (x, y) pair from an input string point.
    @params point string '(x,y)' coordinate
    @return (x, y) floats
    """
    x_str, y_str = point.split(',')
    return float(x_str[1:]), float(y_str[:-1])

def create_bins(min_pt, max_pt, n):
    """
    Creates n equally spaced bin edges between min_pt and max_pt.
    @params min_pt float min value
    @params max_pt float max value
    @params n number of bin edges to create
    @return np.array of bin edges
    """
    return np.linspace(min_pt, max_pt, n)

def get_bin(bins, val):
    """
    Determines which bin the input val falls into. Bins are represented
    by an increasing np.array. val is assigned to the highest bin edge
    whose value is less than or equal to val (e.g. for bins [0.0, 0.5, 1.0],
    0.25 is assigned to bin 0.0 and 0.75 to bin 0.5).
    @params bins np.array of increasing values
    @params val float to bin
    @return the bin edge val belongs to, or None if val is below every edge
    """
    for b in bins[::-1]:  # iterate through bin edges in reverse order
        if val >= b:
            return b
    return None
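A quick sanity check of the binning helpers on the docstring's example (a scratch sketch, not output from the original run):

demo_bins = create_bins(0.0, 1.0, 3)  # array([0.0, 0.5, 1.0])
print(get_bin(demo_bins, 0.25))       # 0.0
print(get_bin(demo_bins, 0.75))       # 0.5
print(get_bin(demo_bins, 1.0))        # 1.0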
In [30]:
#testing something on a mini data set
minidata = data_911.head()
#minidata
#data_911
In [4]:
mintime = data_911['DateTime'].min()
maxtime = data_911['DateTime'].max()
print 'Maxtime: ' + str(maxtime)
print 'Mintime: ' + str(mintime)
Maxtime: 2014-11-29 00:57:11
Mintime: 2008-07-01 00:13:00
In [5]:
pd.to_datetime('2014-11-29 00:57:11')
Out[5]:
In [31]:
# make time bins for the 911 dataset using the min/max times found above
mintime = pd.to_datetime('2008-07-01 00:13:00')
maxtime = pd.to_datetime('2014-11-29 00:57:11')
print maxtime - mintime
# subtracting two datetimes yields a Timedelta object
bins = create_bins(0, (maxtime - mintime).total_seconds(), 1000)
data_911['TimeBin'] = data_911.apply(lambda row: get_bin(bins, (row['DateTime'] - mintime).total_seconds()), axis=1)
data_911.head()
Out[31]:
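The row-wise apply above is slow on a large frame. A sketch of an equivalent vectorized version with np.digitize, which follows the same left-edge convention as get_bin (assuming no missing DateTime values; values below the first edge get bins[0] here rather than None):

seconds = (data_911['DateTime'] - mintime).dt.total_seconds().values
idx = np.digitize(seconds, bins)  # index of the first edge greater than each value
data_911['TimeBin'] = bins[np.clip(idx - 1, 0, len(bins) - 1)]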
In [32]:
data_911.tail()
Out[32]:
In [33]:
max_x_loc = minidata['X'].max()
min_x_loc = minidata['X'].min()
print "min X location: %f" % min_x_loc
print "max X location: %f" % max_x_loc
max_y_loc = minidata['Y'].max()
min_y_loc = minidata['Y'].min()
print "min Y location: %f" % min_y_loc
print "max Y location: %f" % max_y_loc
In [34]:
# hardcoded X/Y location bounds
max_x_loc = 37.881603
min_x_loc = 37.624394
print "min X location: %f" % min_x_loc
print "max X location: %f" % max_x_loc
max_y_loc = -119.850760
min_y_loc = -122.514436
print "min Y location: %f" % min_y_loc
print "max Y location: %f" % max_y_loc
In [35]:
xbins = create_bins(min_x_loc, max_x_loc, 10)
ybins = create_bins(min_y_loc, max_y_loc, 10)
data_911['XBin'] = data_911.apply(lambda row: get_bin(xbins, row['X']), axis=1)
data_911['YBin'] = data_911.apply(lambda row: get_bin(ybins, row['Y']), axis=1)
# keep only the columns needed downstream
data_911 = data_911[['Category', 'Descript', 'PdDistrict', 'Resolution', 'TimeBin', 'XBin', 'YBin']]
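The two spatial apply calls could likewise be vectorized with pd.cut. A sketch; right=False makes the intervals left-closed like get_bin, with one edge-case difference: a value exactly equal to the maximum falls outside the last interval and becomes NaN:

# label each coordinate with the left edge of its grid interval
data_911['XBin'] = pd.cut(data_911['X'], bins=xbins, labels=xbins[:-1], right=False)
data_911['YBin'] = pd.cut(data_911['Y'], bins=ybins, labels=ybins[:-1], right=False)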
In [36]:
# store into csv as a checkpoint; index=False keeps the row index out of the file
data_911.to_csv('intermediate_911.csv', index=False)
In [37]:
#DATA_PATH = "/home/datascience/project/intermediate_911.csv" # Make this the /path/to/the/data
DATA_PATH = "/home/casual1water/cs194-16-data_manatees/intermediate_911.csv" # path of amazon machine
import pandas as pd
data_911 = pd.read_csv(DATA_PATH, na_values=['-'])
data_911 = data_911.where((pd.notnull(data_911)), None)
data_911.head()
Out[37]:
In [38]:
# group rows by their (TimeBin, XBin, YBin) cell
agg_dict = {}
for index, row in data_911.iterrows():
    key = (row['TimeBin'], row['XBin'], row['YBin'])
    if key not in agg_dict:
        agg_dict[key] = []
    agg_dict[key].append(row.to_dict())
In [39]:
# reshape into two parallel lists: one of bin tuples, one of report lists
agg_dict2 = {}
agg_dict2['911-reports'] = []
agg_dict2['tuple'] = []
for k in agg_dict:
    agg_dict2['911-reports'].append(agg_dict[k])
    agg_dict2['tuple'].append(k)
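These two cells could be collapsed into one groupby pass. A minimal sketch producing the same two columns (alt_df is a hypothetical name):

grouped = data_911.groupby(['TimeBin', 'XBin', 'YBin'])
alt_df = pd.DataFrame({
    'tuple': [key for key, grp in grouped],
    '911-reports': [grp.to_dict('records') for key, grp in grouped],
})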
In [40]:
# spot-check the aggregated structure
df_mini = pd.DataFrame(agg_dict2)
df_mini.head(1)['911-reports']
print df_mini['911-reports'][0][0]
In [41]:
final_df = pd.DataFrame(agg_dict2)
final_df.to_csv("data_911_ready_to_join.csv", index=False)
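One caveat for the downstream join: to_csv serializes the '911-reports' and 'tuple' columns as plain strings, so a consumer has to parse them back. A sketch using the standard-library ast module (this works only if every value in the dicts is a plain literal; a stray NaN would make literal_eval raise):

import ast
restored = pd.read_csv("data_911_ready_to_join.csv")
restored['911-reports'] = restored['911-reports'].apply(ast.literal_eval)
restored['tuple'] = restored['tuple'].apply(ast.literal_eval)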
In [ ]: