This notebook takes in the raw 311 data (from datasf.org) and converts it into a (feature vector, bin tuple) dataframe for joining with the 911 dataset.
In [2]:
#Notebook for converting 311 data into the format (bin tuple, list of 311 reports that occurred in that bin)
DATA_PATH = "data/data_311.csv" # path on the Amazon machine
import pandas as pd
data_311 = pd.read_csv(DATA_PATH, na_values=['-'])
data_311 = data_311.where((pd.notnull(data_311)), None) # replace NaN with None
data_311 = data_311[['Opened', 'Category', 'Request Type', 'Supervisor District', 'Point']]
data_311.head()
Out[2]:
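The where/pd.notnull call above swaps NaN for None; a toy illustration of the idiom (hypothetical data, not the real frame):

toy = pd.DataFrame({'a': [1.0, float('nan')]})
print toy.where(pd.notnull(toy), None).values  # the NaN entry becomes None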
In [3]:
import numpy as np
#drop rows with missing values (thresh=5 requires all 5 remaining columns to be non-null);
#this also removes the blank rows at the end of the 311 dataset
data_311 = data_311.dropna(thresh=5)
data_311.tail()
Out[3]:
In [4]:
#convert the time the 311 report was opened to a pandas Timestamp
data_311['DateTime'] = pd.to_datetime(data_311['Opened'])
data_311 = data_311.drop('Opened', axis=1)
data_311.head()
Out[4]:
In [5]:
def create_bins(min_pt, max_pt, n):
    """
    Creates n equally spaced bin edges between min_pt and max_pt
    (inclusive), i.e. n-1 equal-width intervals.
    @params min_pt float min value
    @params max_pt float max value
    @params n number of bin edges to create
    @return np.array of bin edge values
    """
    return np.linspace(min_pt, max_pt, n)
import bisect
def get_bin(bins, val):
    """
    Determines which bin the input val falls into. Bins are represented
    by an increasing np.array of edges. Val is assigned to the largest
    edge that is less than or equal to val (e.g. for bins [0.0, 0.5, 1.0],
    0.25 is assigned to bin 0.0 and 0.75 to bin 0.5).
    @params bins np.array of increasing values
    @params val float to bin
    @return bin edge that val belongs to
    """
    # bisect_right returns 2 for [0.0, 0.5, 1.0] with val 0.75; subtracting 1
    # gives the index of the largest edge <= val
    index = bisect.bisect_right(bins, val) - 1
    return bins[index]
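A quick sanity check of these helpers (the values here are arbitrary, chosen only for illustration):

edges = create_bins(0.0, 1.0, 3)  # array([0.0, 0.5, 1.0])
print get_bin(edges, 0.25)        # 0.0
print get_bin(edges, 0.75)        # 0.5
print get_bin(edges, 1.0)         # 1.0 (the max value maps to the last edge)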
In [6]:
mintime = data_311['DateTime'].min()
maxtime = data_311['DateTime'].max()
print mintime
print maxtime
In [7]:
sec_range = ((maxtime - mintime).total_seconds()) / 5000
#approximate width of one time bin in hours (5000 edges actually give
#4999 intervals, so this slightly understates the true width)
print sec_range / 3600
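For intuition, here is the same arithmetic on a hypothetical one-year span (an illustrative sketch, not the real data range):

span = 365 * 24 * 3600.0            # hypothetical one-year span, in seconds
edges = create_bins(0, span, 5000)
print (edges[1] - edges[0]) / 3600  # true bin width: ~1.75 hours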
In [8]:
#make the time bin for the 311 dataset;
#subtracting two datetimes yields a timedelta, whose total_seconds() we bin
bins = create_bins(0, (maxtime - mintime).total_seconds(), 5000)
data_311['TimeBin'] = data_311.apply(lambda row: get_bin(bins, (row['DateTime'] - mintime).total_seconds()), axis=1)
data_311.head()
Out[8]:
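The row-wise apply above is slow on a frame this size; an equivalent vectorized sketch using np.digitize (which matches bisect_right minus one for these edges; untested against this exact data) would be:

# assumes 'bins' and 'mintime' from the cell above
secs = np.array([(t - mintime).total_seconds() for t in data_311['DateTime']])
data_311['TimeBin'] = bins[np.digitize(secs, bins) - 1]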
In [9]:
print bins
In [10]:
#split the Point string "(lat, lon)" into X (longitude) and Y (latitude) columns
data_311['X'] = data_311.apply(lambda row: float(row['Point'].strip(')(').split(',')[1]), axis=1)
data_311['Y'] = data_311.apply(lambda row: float(row['Point'].strip(')(').split(',')[0]), axis=1)
data_311.head()
Out[10]:
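A sanity check of that parsing on a made-up Point string (the dataset stores points as "(lat, lon)", so index 1 is the longitude):

sample = "(37.774, -122.419)"  # hypothetical Point value
print float(sample.strip(')(').split(',')[1])  # X (longitude): -122.419
print float(sample.strip(')(').split(',')[0])  # Y (latitude): 37.774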
In [11]:
# filter out all data in the top/bottom 1% of X or Y locations.
# This was done because we noticed some outliers in the X and Y data.
#min_x_loc = np.percentile(data_311['X'], 1)  # without filter it was -122.514436
#max_x_loc = np.percentile(data_311['X'], 99) # without filter -119.850760
#min_y_loc = np.percentile(data_311['Y'], 1)  # without filter 37.624394
#max_y_loc = np.percentile(data_311['Y'], 99) # without filter 37.881603
#orig_len = len(data_311)
#print "done"
In [11]:
# find the min and max locations. These values are hard-coded into format_911.
# Hard-coding them (as we do in the 911 data) avoids carrying hidden precision.
min_x_loc = -122.505716
max_x_loc = -122.384338
min_y_loc = 37.710652
max_y_loc = 37.803545
print "min x location: %f" % min_x_loc #min x location: -122.505716
print "max x location: %f" % max_x_loc #max x location: -122.384338
print "min y location: %f" % min_y_loc #min y location: 37.710652
print "max y location: %f" % max_y_loc #max y location: 37.803545
In [12]:
#print "original length : " + str(orig_len)
#print "original length * 96% : " + str(orig_len*.96)
data_311 = data_311[data_311['X']>=min_x_loc]
data_311 = data_311[data_311['X']<=max_x_loc]
data_311 = data_311[data_311['Y']>=min_y_loc]
data_311 = data_311[data_311['Y']<=max_y_loc]
print "after length : " + str(len(data_311)) #should go down about 4% (actually .98*.98)
In [14]:
#temporary alternative: don't cut off the extremes
#min_x_loc = -122.514436
#max_x_loc = -119.850760
#min_y_loc = 37.624394
#max_y_loc = 37.881603
#print "done"
In [13]:
#create xbins and ybins (10 edges -> a 9x9 spatial grid)
xbins = create_bins(min_x_loc, max_x_loc, 10)
ybins = create_bins(min_y_loc, max_y_loc, 10)
data_311['XBin'] = data_311.apply(lambda row: get_bin(xbins, row['X']), axis=1)
data_311['YBin'] = data_311.apply(lambda row: get_bin(ybins, row['Y']), axis=1)
data_311.head()
Out[13]:
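For scale: with 10 edges the grid is 9x9 cells; a rough back-of-the-envelope (assuming ~111 km per degree of latitude and ~88 km per degree of longitude at San Francisco's latitude) puts each cell at roughly a kilometer per side:

print (max_x_loc - min_x_loc) / 9 * 88.0   # cell width in km, ~1.19
print (max_y_loc - min_y_loc) / 9 * 111.0  # cell height in km, ~1.15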
In [14]:
#we now have TimeBin, XBin, and YBin.
#Drop DateTime, Point, X, and Y (the columns used to generate these bins).
data_311 = data_311[['Category','Request Type', 'Supervisor District', 'TimeBin','XBin','YBin']]
data_311.head()
Out[14]:
In [15]:
#store into csv as a checkpoint. Note that index_label=False still writes the
#index values, just without a header; read_csv below re-infers that column as the index.
data_311.to_csv('intermediate_311.csv', index_label=False)
In [16]:
DATA_PATH = "intermediate_311.csv" # path of amazon machine
import pandas as pd
data_311 = pd.read_csv(DATA_PATH, na_values=['-'])
data_311.head()
Out[16]:
In [17]:
#aggregate 311 reports by their (TimeBin, XBin, YBin) tuple
agg_dict_311 = {}
for index, row in data_311.iterrows():
    tupe = (row['TimeBin'], row['XBin'], row['YBin'])
    if tupe not in agg_dict_311:
        agg_dict_311[tupe] = []
    agg_dict_311[tupe].append(row.to_dict())
print len(agg_dict_311)
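The same aggregation can be written a bit more compactly with collections.defaultdict; an equivalent sketch:

from collections import defaultdict
agg = defaultdict(list)
for _, row in data_311.iterrows():
    agg[(row['TimeBin'], row['XBin'], row['YBin'])].append(row.to_dict())
print len(agg)  # should match len(agg_dict_311)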
In [38]:
# Get all possible labels for the categorical columns by looping through the data.
Category_set = set()
Request_Type_set = set()
#Request_Details_set = set()
Supervisor_District_set = set()
#Source_set = set()
for row_tuple in data_311.iterrows():
    row = row_tuple[1]  # a pandas Series
    Category_set.add(row['Category'])
    Request_Type_set.add(row['Request Type'])
    #Request_Details_set.add(row['Request Details'])
    Supervisor_District_set.add(row['Supervisor District'])
    #Source_set.add(row['Source'])
print "done"
In [21]:
print len(Category_set)
#print len(Request_Type_set)
#print len(Request_Details_set)
print len(Supervisor_District_set)
#print len(Source_set)
print Category_set
print Supervisor_District_set
#print Source_set
In [23]:
#create map dictionaries that map each label from the category sets to an index in the feature vector.
category_map = {}
supervisor_map = {}
source_map = {}
index = 0
for cat in Category_set:
    category_map[cat] = index
    index += 1
for cat in Supervisor_District_set:
    supervisor_map[cat] = index
    index += 1
#for cat in Source_set:
#    source_map[cat] = index
#    index += 1
# index now equals the total length of the feature vector

# Generate a feature vector given a list of dictionaries; each dictionary represents one 311 report
# and has the form {'Category': string, 'Request Type': string, 'Supervisor District': string,
# 'TimeBin': float, 'XBin': float, 'YBin': float}.
# Regularization should also go here.
def generate_feature_vector(reports_311):
    # simple feature vector: counts of 311 reports per label
    # from Category and Supervisor District
    feature_vector = [0] * index
    for report in reports_311:
        # the first 27 features count reports per Category label;
        # the next 12 count reports per Supervisor District label
        # (the Source features are currently disabled)
        feature_vector[category_map[report["Category"]]] += 1
        feature_vector[supervisor_map[report["Supervisor District"]]] += 1
        #feature_vector[source_map[report["Source"]]] += 1
    return feature_vector
print "done"
In [39]:
print category_map
print supervisor_map
In [25]:
#build the columns of the final dataframe: one row per (TimeBin, XBin, YBin) tuple
agg_dict_311_2 = {}
agg_dict_311_2['feature-311'] = []
agg_dict_311_2['tuple'] = []
for key in agg_dict_311:
    val = agg_dict_311[key]
    feature_vector_311 = generate_feature_vector(val)
    agg_dict_311_2['tuple'].append(key)
    agg_dict_311_2['feature-311'].append(feature_vector_311)
print "done"
In [26]:
len(agg_dict_311_2['feature-311'])
Out[26]:
In [27]:
df_311_final = pd.DataFrame(agg_dict_311_2)
df_311_final.head(5)
Out[27]:
In [35]:
#spot-check the first feature vector and its length
print df_311_final.iloc[0][0]
print len(df_311_final.iloc[0][0])
In [28]:
#final checkpoint: the (tuple, feature vector) dataframe, ready to join with the 911 data
df_311_final.to_csv('df_311_ready_to_join.csv', index_label=False)
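One caveat with this checkpoint: CSV stores the 'feature-311' lists (and the 'tuple' keys) as their string representations, so a downstream reader has to parse them back out, e.g. (a sketch, not part of the actual join code):

import ast
check = pd.read_csv('df_311_ready_to_join.csv')
print type(check['feature-311'].iloc[0])                 # str, not list
print ast.literal_eval(check['feature-311'].iloc[0])[:5] # recovered list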
In [29]:
#count the distinct TimeBin, XBin, and YBin values that appear among the aggregated tuples
for i, name in enumerate(['TimeBin', 'XBin', 'YBin']):
    print name, len(set(x[i] for x in df_311_final['tuple']))