This notebook takes in the raw 311 data and the raw 911 data, and creates a dataframe with 3 columns: the (TimeBin, XBin, YBin) tuple, the list of 311 reports that fall in that bin, and the list of 911 reports that fall in that bin.
In [3]:
#Notebook for converting 311 data into the format (tuple, list of 311 reports that occurred at that tuple)
#Request Details is not used and contains NaN values, so we drop that column here.
DATA_PATH = "data/data_311.csv" # path on the Amazon machine
import pandas as pd
data_311 = pd.read_csv(DATA_PATH, na_values=['-'])
data_311 = data_311.where((pd.notnull(data_311)), None) #replace NaN with None (upcasts the affected columns to object dtype)
#data_311 = data_311[['Opened', 'Category', 'Request Type', 'Request Details', 'Supervisor District', 'Point', 'Source']]
data_311 = data_311[['Opened', 'Category', 'Request Type', 'Supervisor District', 'Point']]
data_311.head()
Out[3]:
In [4]:
import numpy as np
#drop the blank rows at the end of the 311 dataset (keep only rows with all 5 columns non-null)
data_311 = data_311.dropna(thresh=5)
data_311.tail(5)
Out[4]:
In [5]:
#convert the time the 311 report was opened to a python datetime
data_311['DateTime'] = pd.to_datetime(data_311['Opened'])
data_311 = data_311.drop('Opened', axis=1)
data_311.head()
Out[5]:
In [6]:
def create_bins(min_pt, max_pt, n):
"""
Creates n equally spaced bin edges between min_pt and max_pt
@params min_pt float min value
@params max_pt float max value
@params n int number of bin edges to create
@return np.array of bin edges
"""
return np.linspace(min_pt, max_pt, n)
import bisect
def get_bin(bins, val):
"""
Determines which bin the input val falls into. Bins are represented
by an increasing np.array of left edges. Val is assigned to the highest
edge whose value is less than or equal to val. (e.g. for bins
[0.0, 0.5, 1.0], 0.25 is assigned to bin 0.0 and 0.75 to bin 0.5)
Note: a val below bins[0] gives index -1 and silently wraps to bins[-1],
so inputs should be filtered to the bin range first.
@params bins np.array of increasing values
@params val float to bin
@return bin (left edge) that val belongs to
"""
index = bisect.bisect_right(bins, val)-1 #bisect_right returns 2 for 0.75; unlike bisect_left, it also keeps exact edge matches (e.g. 0.5) in their own bin
return bins[index]
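A quick sanity check of the two helpers (a sketch with made-up values, not part of the original run):
edges = create_bins(0.0, 1.0, 3) # array([ 0. , 0.5, 1. ])
print get_bin(edges, 0.25) # 0.0
print get_bin(edges, 0.75) # 0.5
print get_bin(edges, 0.5) # 0.5 -- an exact edge match stays in its own bin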
In [7]:
mintime = data_311['DateTime'].min()
maxtime = data_311['DateTime'].max()
print mintime
print maxtime
In [8]:
sec_range = (maxtime-mintime).total_seconds()/5000
#with 5000 time-bin edges, each time bin spans this many hours
print sec_range/3600
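As a rough check (using the min/max times hard-coded further down in this notebook): 2008-07-01 00:13:00 to 2014-11-29 00:57:11 is about 2342 days, roughly 2.02e8 seconds, so 2.02e8 / 5000 gives bins of about 40,470 seconds, i.e. around 11.2 hours each.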
In [9]:
#make time bins for the 311 dataset
#subtracting 2 datetimes creates a timedelta object
bins = create_bins(0, (maxtime-mintime).total_seconds(), 5000)
data_311['TimeBin'] = data_311.apply(lambda row: get_bin(bins, (row['DateTime']-mintime).total_seconds()), axis = 1)
data_311.head()
Out[9]:
In [10]:
print bins
In [11]:
#separate the Point tuple into X and Y columns; the string is "(lat, lon)",
#so index 1 is longitude (X) and index 0 is latitude (Y)
data_311['X'] = data_311.apply(lambda row: float(row['Point'].strip(')(').split(',')[1]), axis=1)
data_311['Y'] = data_311.apply(lambda row: float(row['Point'].strip(')(').split(',')[0]), axis=1)
data_311.head()
Out[11]:
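An equivalent vectorized version, which avoids the per-row apply (a sketch; assumes a pandas recent enough to support str.split(expand=True)):
latlon = data_311['Point'].str.strip('()').str.split(',', expand=True).astype(float)
data_311['Y'] = latlon[0] # latitude
data_311['X'] = latlon[1] # longitude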
In [11]:
# filter out all data that are in the top/bottom 1% of X or Y locations.
# This was done because we noticed there were some outliers in the X and Y data.
#min_x_loc = np.percentile(data_311['X'],1) #without filter it was -122.514436
#max_x_loc = np.percentile(data_311['X'],99) #without filter -119.850760
#min_y_loc = np.percentile(data_311['Y'],1) #without filter 37.624394
#max_y_loc = np.percentile(data_311['Y'],99) #without filter 37.881603
#orig_len = len(data_311)
In [12]:
# find the min and max locations. These values will be hard-coded into format_911.
# hard code these (like we do in the 911 data) to get rid of hidden precision
min_x_loc = -122.505716
max_x_loc = -122.384338
min_y_loc = 37.710652
max_y_loc = 37.803545
print "min x location: %f" % min_x_loc #min x location: -122.505716
print "max x location: %f" % max_x_loc #max x location: -122.384338
print "min y location: %f" % min_y_loc #min y location: 37.710652
print "max y location: %f" % max_y_loc #max y location: 37.803545
In [13]:
#print "original length : " + str(orig_len)
#print "original length * 96% : " + str(orig_len*.96)
data_311 = data_311[data_311['X']>=min_x_loc]
data_311 = data_311[data_311['X']<=max_x_loc]
data_311 = data_311[data_311['Y']>=min_y_loc]
data_311 = data_311[data_311['Y']<=max_y_loc]
print "after length : " + str(len(data_311)) #should go down about 4% (actually .98*.98)
In [14]:
#create 10 equally spaced x and y bin edges over the filtered area
xbins = create_bins(min_x_loc, max_x_loc, 10)
ybins = create_bins(min_y_loc, max_y_loc, 10)
data_311['XBin'] = data_311.apply(lambda row: get_bin(xbins, row['X']), axis = 1)
data_311['YBin'] = data_311.apply(lambda row: get_bin(ybins, row['Y']), axis = 1)
data_311.head()
Out[14]:
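A quick look at how the reports spread across the spatial grid (a sketch, not part of the original run):
print data_311.groupby(['XBin', 'YBin']).size().describe()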
In [15]:
#we now have TimeBin, XBin, and YBin.
#Drop DateTime, Point, X, and Y (the columns we used to generate these bins).
data_311 = data_311[['Category','Request Type', 'Supervisor District', 'TimeBin','XBin','YBin']]
data_311.head()
Out[15]:
In [16]:
#store into csv as a checkpoint
data_311.to_csv('intermediate_3112.csv', index=False) #index=False (not index_label=False) keeps the index out of the file
In [17]:
DATA_PATH = "intermediate_3112.csv" # path of amazon machine
import pandas as pd
data_311 = pd.read_csv(DATA_PATH, na_values=['-'])
data_311.head()
Out[17]:
In [18]:
#aggregate 311 reports by tuple
agg_dict_311 = {}
for index, row in data_311.iterrows():
tupe = (row['TimeBin'], row['XBin'], row['YBin'])
if tupe not in agg_dict_311:
agg_dict_311[tupe] = []
agg_dict_311[tupe].append(row.to_dict())
print len(agg_dict_311)
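The same aggregation can be written with groupby, which is usually much faster than iterrows (a sketch, assuming this pandas version supports to_dict('records')):
grouped = data_311.groupby(['TimeBin', 'XBin', 'YBin'])
feature_311 = grouped.apply(lambda g: g.to_dict('records')) # Series: tuple -> list of report dicts
print len(feature_311)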
In [19]:
agg_dict_311_2 = {}
agg_dict_311_2['feature-311'] = []
agg_dict_311_2['tuple'] = []
for key in agg_dict_311:
val = agg_dict_311[key]
agg_dict_311_2['tuple'].append(key)
agg_dict_311_2['feature-311'].append(val)
print "done"
In [20]:
df_311_final = pd.DataFrame(agg_dict_311_2)
df_311_final.head(5)
Out[20]:
In [21]:
# store a temporary result in case of crash
df_311_final.to_csv('df_311_ready_to_join_tmp.csv', index=False)
In [22]:
# now format the 911 data
DATA_PATH = "data/data_911_with_dateTime.csv" # Make this the /path/to/the/data
import pandas as pd
data_911 = pd.read_csv(DATA_PATH, na_values=['-'])
data_911 = data_911.where((pd.notnull(data_911)), None)
data_911.head(1)
Out[22]:
In [23]:
import numpy as np
#convert to datetime
data_911['DateTime'] = pd.to_datetime(data_911['DateTime'])
type(data_911.head()['DateTime'][0])
Out[23]:
In [24]:
#temp. don't cut off extremes
#min_x_loc = -122.514436
#max_x_loc = -119.850760
#min_y_loc = 37.624394
#max_y_loc = 37.881603
#print "done"
min_x_loc = -122.505716
max_x_loc = -122.384338
min_y_loc = 37.710652
max_y_loc = 37.803545
print "min x location: %f" % min_x_loc #min x location: -122.505716
print "max x location: %f" % max_x_loc #max x location: -122.384338
print "min y location: %f" % min_y_loc #min y location: 37.710652
print "max y location: %f" % max_y_loc #max y location: 37.803545
In [25]:
#make time bins for the 911 dataset
#hard-code the 311 dataset's min/max times so both datasets share the same 5000 time bins
mintime = pd.to_datetime('2008-07-01 00:13:00')
maxtime = pd.to_datetime('2014-11-29 00:57:11')
print maxtime-mintime
#subtracting 2 datetimes creates a timedelta object
bins = create_bins(0, (maxtime-mintime).total_seconds(), 5000)
data_911['TimeBin'] = data_911.apply(lambda row: get_bin(bins, (row['DateTime']-mintime).total_seconds()), axis = 1)
data_911.head()
Out[25]:
In [26]:
xbins = create_bins(min_x_loc, max_x_loc, 10)
ybins = create_bins(min_y_loc, max_y_loc, 10)
data_911['XBin'] = data_911.apply(lambda row: get_bin(xbins, row['X']), axis = 1)
data_911['YBin'] = data_911.apply(lambda row: get_bin(ybins, row['Y']), axis = 1)
data_911 = data_911[['Category', 'Descript', 'PdDistrict', 'Resolution', 'TimeBin', 'XBin', 'YBin']]
In [27]:
xbins #should be array [-122.505716 ... -122.384338]
Out[27]:
In [28]:
#store into csv as a checkpoint
data_911.to_csv('intermediate_9112.csv', index=False)
In [29]:
#DATA_PATH = "/home/datascience/project/intermediate_911.csv" # Make this the /path/to/the/data
DATA_PATH = "intermediate_9112.csv" # path on the Amazon machine (the checkpoint written above)
import pandas as pd
data_911 = pd.read_csv(DATA_PATH, na_values=['-'])
data_911 = data_911.where((pd.notnull(data_911)), None)
data_911.head()
Out[29]:
In [30]:
agg_dict = {} #maps (Time, X, Y) tuples to list of 911-reports
for index, row in data_911.iterrows():
tupe = (row['TimeBin'], row['XBin'], row['YBin'])
if tupe not in agg_dict:
agg_dict[tupe] = []
agg_dict[tupe].append(row.to_dict())
agg_dict2 = {} # dictionary to create dataframe with
agg_dict2['911-reports'] = []
agg_dict2['tuple'] = []
for k in agg_dict:
agg_dict2['911-reports'].append(agg_dict[k])
agg_dict2['tuple'].append(k)
In [31]:
(agg_dict2['911-reports'])[0:5]
Out[31]:
In [40]:
df_911_final = pd.DataFrame(agg_dict2)
print df_911_final.head(2)
print len(df_911_final.iloc[0][0])
In [41]:
#store into csv before joins, in case we want to analyze these dataframes.
df_311_final.to_csv('df_311_final.csv', index=False)
df_911_final.to_csv('df_911_final.csv', index=False)
In [33]:
import matplotlib.pyplot as plt
#dir(plt)
print (df_911_final['911-reports'].apply(len)).describe()
print (df_311_final['feature-311'].apply(len)).describe()
In [45]:
print df_311_final.head(2)
print len(df_311_final)
print df_911_final.head(2)
print len(df_911_final)
In [46]:
# join the dataframes.
# left-outer join on the tuples, so every 311 bin is kept
joined_df = df_311_final.merge(df_911_final, how='left', on='tuple')
# bins with no matching 911 reports come back as NaN; turn those into empty lists
joined_df = joined_df.where((pd.notnull(joined_df)), 0)
joined_df['911-reports'] = joined_df['911-reports'].apply(lambda x: [] if x==0 else x)
print "done joining"
In [47]:
joined_df.info()
In [48]:
joined_df.head(15)
Out[48]:
In [37]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
counts_911_series = joined_df['911-reports'].apply(lambda x: 0 if not x else len(x))
print counts_911_series.describe()
counts_911_series.value_counts()
Out[37]:
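To see the distribution rather than just summarize it, a histogram is one option (a sketch using the pyplot import above):
counts_911_series.hist(bins=50)
plt.xlabel('911 reports per (TimeBin, XBin, YBin) bin')
plt.ylabel('number of bins')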
In [49]:
#store the joined dataframe into csv.
joined_df.to_csv('joined_unfeaturized.csv', index=False)
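One caveat with these CSV checkpoints (an assumption to verify, not something this notebook does): the tuple and list-of-dict columns are serialized as their string repr, so reading the file back requires re-parsing them, e.g.:
import ast
df = pd.read_csv('joined_unfeaturized.csv')
df['tuple'] = df['tuple'].apply(ast.literal_eval)
df['911-reports'] = df['911-reports'].apply(ast.literal_eval) # note: fails if any value was written as nan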