data_count_extractor provides helpful functions to process the data to get the counts of certain crime types divided by location and time. Time granularity can be set using day, year, or month.


In [4]:
import numpy as np
import pandas as pd

data_path = "data/Case_Data_from_San_Francisco_311__SF311_.csv"

def get_loc_time_counts(csv_path, time_interval, location_interval, x_interval, y_interval, n_bins):
    """
    Gets the counts for each category of time divided by pdDistrict and time.
    
    @params csv_path string to csv_file
    @params time string granularity of time split
        "d" : split by day/month/year
        "m" : split by month/year
        "y" : split by year
    @params location_interval float size of each inteval in location grid
    @params min_x, min_y x,y coordinate of where to start grid
        Note: this argument is required because the max/min of 311 may not 
            be the same as 911
    @return pd.Dataframe [datetime, location, category, count]
    """
    data = pd.read_csv(csv_path, low_memory=False)
    
    # map each point to a location interval
    x_min = x_interval[0]
    x_max = x_interval[1]
    x_bins = create_bins(min_x, max_x, n_bins)
    
    y_min = y_interval[0]
    y_max = y_interval[1]
    y_bins = create_bins(min_y, max_y, n_bins)
    
    xy_bins_col = data.apply(lambda row: get_xy(row['Point']))
    
    
    # map each point to a datetime interval
    
    
    for district_name, district_data in data.groupby("PdDistrict"):
        
        for date_time, date_time_data in district_data.groupby("DateTime"):
            crime_counts = date_time_data.groupby("Category").size()

            
def get_xy(point):
    """
    Gets the x_point from an input string point.
    
    @params point string '(x,y)' coordinate 
    @return (x,y) float
    """ 
    x_str, y_str = point.split(',')
    return float(x_str[1:]), float(y_str[:-1])


def create_bins(min_pt, max_pt, n):
    """
    Creates n equally spaced bins between min_pt and max_pt
    
    @params min_pt float min value
    @params max_pt float max value
    @params n number of bins to create
    @return np.array bin values
    """
    return np.linspace(min_pt, max_pt, n)

def get_bin(bins, val):
    """
    Determines which bin the input val falls into. Bins are represented 
    by an increasing np.array. Val is assigned to the highest bin whose 
    value is less than val. (e.g. for bins [0.0, 0.5, 1.0], 0.25 would 
    be assigned to bin 0.0, 0.75 would be assigned to 0.5)
    
    @params bins np.array of increasing values
    @params val float to bin
    @return bin that val belongs to
    """
    for bin in bins[::-1]: # iterate through bins in reverse order
        if val >= bin:
            return bin

In [19]:
data = pd.read_csv(data_path,
                   names=["CaseID", "Opened", "Closed", "Updated", "Status", "Responsible Agency",
                          "Category", "Request Type", "Request Details", "Address", 
                          "Supervisor District", "Neighborhood", "Point", "Source", "Media URL"],
                   na_values=['-', ''],
                   low_memory=False)

# drop NaN rows
data = data.dropna(axis=1, how='all')

In [21]:



Out[21]:
(1097196, 15)

In [28]:
x_interval = (-122.3051, -122.9431)
y_interval = (37.8001, 38.531)
n_bins = 10

# map each point to a location interval
x_min = x_interval[0]
x_max = x_interval[1]
x_bins = create_bins(x_min, x_max, n_bins)
    
y_min = y_interval[0]
y_max = y_interval[1]
y_bins = create_bins(y_min, y_max, n_bins)
    
xy_bins_col = data.apply(lambda row: get_xy(row['Point']), axis=1)


---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-28-68c22de3db63> in <module>()
     12 y_bins = create_bins(y_min, y_max, n_bins)
     13 
---> 14 xy_bins_col = data.apply(lambda row: get_xy(row['Point']), axis=1)

/usr/lib/python2.7/dist-packages/pandas/core/frame.pyc in apply(self, func, axis, broadcast, raw, reduce, args, **kwds)
   3310                     if reduce is None:
   3311                         reduce = True
-> 3312                     return self._apply_standard(f, axis, reduce=reduce)
   3313             else:
   3314                 return self._apply_broadcast(f, axis)

/usr/lib/python2.7/dist-packages/pandas/core/frame.pyc in _apply_standard(self, func, axis, ignore_failures, reduce)
   3398             try:
   3399                 for i, v in enumerate(series_gen):
-> 3400                     results[i] = func(v)
   3401                     keys.append(v.name)
   3402             except Exception as e:

<ipython-input-28-68c22de3db63> in <lambda>(row)
     12 y_bins = create_bins(y_min, y_max, n_bins)
     13 
---> 14 xy_bins_col = data.apply(lambda row: get_xy(row['Point']), axis=1)

<ipython-input-6-935e73fcdf6f> in get_xy(point)
     49     @return (x,y) float
     50     """ 
---> 51     x_str, y_str = point.split(',')
     52     return float(x_str[1:]), float(y_str[:-1])
     53 

AttributeError: ("'float' object has no attribute 'split'", u'occurred at index 1097196')

In [30]:


In [ ]: