data_count_extractor provides helper functions that process the data to get counts of certain crime types, divided by location and time. Time granularity can be set to day, month, or year.
In [4]:
import numpy as np
import pandas as pd
data_path = "data/Case_Data_from_San_Francisco_311__SF311_.csv"
def get_loc_time_counts(csv_path, time_interval, location_interval, x_interval, y_interval, n_bins):
    """
    Gets the counts for each crime category, divided by PdDistrict and time.
    @params csv_path string path to csv file
    @params time_interval string granularity of the time split
        "d" : split by day/month/year
        "m" : split by month/year
        "y" : split by year
    @params location_interval float size of each interval in the location grid
    @params x_interval, y_interval (min, max) x and y extents of the location grid
        Note: these arguments are required because the max/min of 311 may not
        be the same as 911
    @params n_bins number of bins along each axis of the location grid
    @return pd.DataFrame [datetime, location, category, count]
    """
    data = pd.read_csv(csv_path, low_memory=False)
    # map each point to a location interval
    x_min, x_max = x_interval
    x_bins = create_bins(x_min, x_max, n_bins)
    y_min, y_max = y_interval
    y_bins = create_bins(y_min, y_max, n_bins)
    # (x, y) coordinates per row; not yet merged into the counts below
    xy_bins_col = data.apply(lambda row: get_xy(row['Point']), axis=1)
    # map each point to a datetime interval and count categories
    # (time_interval is not applied yet; the DateTime column is used as-is)
    rows = []
    for district_name, district_data in data.groupby("PdDistrict"):
        for date_time, date_time_data in district_data.groupby("DateTime"):
            crime_counts = date_time_data.groupby("Category").size()
            for category, count in crime_counts.items():
                rows.append((date_time, district_name, category, count))
    return pd.DataFrame(rows, columns=["datetime", "location", "category", "count"])
def get_xy(point):
    """
    Parses the (x, y) coordinate from an input string point.
    @params point string '(x,y)' coordinate
    @return (x, y) tuple of floats
    """
    x_str, y_str = point.split(',')
    return float(x_str[1:]), float(y_str[:-1])
def create_bins(min_pt, max_pt, n):
    """
    Creates n equally spaced bins between min_pt and max_pt
    @params min_pt float min value
    @params max_pt float max value
    @params n number of bins to create
    @return np.array of increasing bin values
    """
    return np.linspace(min_pt, max_pt, n)
def get_bin(bins, val):
    """
    Determines which bin the input val falls into. Bins are represented
    by an increasing np.array. Val is assigned to the highest bin whose
    value is less than or equal to val. (e.g. for bins [0.0, 0.5, 1.0],
    0.25 would be assigned to bin 0.0, 0.75 would be assigned to 0.5)
    @params bins np.array of increasing values
    @params val float to bin
    @return bin that val belongs to
    """
    for b in bins[::-1]:  # iterate through bins in reverse order
        if val >= b:
            return b
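A quick, hedged sanity check of the helpers above. It assumes the cell above has been run, and the '(x,y)' string and bin ranges are made-up values for illustration, not taken from the dataset.
In [ ]:
# bin a single made-up coordinate with the helpers defined above
bins = create_bins(0.0, 1.0, 5)              # array([0., 0.25, 0.5, 0.75, 1.])
print(get_bin(bins, 0.3))                    # 0.25 -- highest bin value <= 0.3
x, y = get_xy("(-122.41, 37.77)")            # parse a '(x,y)' string like the Point column
print(get_bin(create_bins(-122.52, -122.35, 10), x))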
In [19]:
data = pd.read_csv(data_path,
names=["CaseID", "Opened", "Closed", "Updated", "Status", "Responsible Agency",
"Category", "Request Type", "Request Details", "Address",
"Supervisor District", "Neighborhood", "Point", "Source", "Media URL"],
na_values=['-', ''],
low_memory=False)
# drop columns that are entirely NaN
data = data.dropna(axis=1, how='all')
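The "d"/"m"/"y" time granularity described at the top is not applied anywhere yet. Below is a minimal sketch of one way to derive a DateTime column at a chosen granularity; it assumes the Opened column holds parseable timestamps, and the "D"/"M"/"Y" codes are pandas period frequencies, not names from this notebook.
In [ ]:
# sketch: truncate the Opened timestamp to the requested granularity
granularity = {"d": "D", "m": "M", "y": "Y"}                # map this notebook's codes to pandas frequencies
opened = pd.to_datetime(data["Opened"], errors="coerce")
data["DateTime"] = opened.dt.to_period(granularity["m"]).astype(str)   # e.g. '2013-07' for month/year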
In [28]:
x_interval = (-122.9431, -122.3051)  # (min_x, max_x); must be increasing for create_bins/get_bin
y_interval = (37.8001, 38.531)       # (min_y, max_y)
n_bins = 10
# map each point to a location interval
x_min = x_interval[0]
x_max = x_interval[1]
x_bins = create_bins(x_min, x_max, n_bins)
y_min = y_interval[0]
y_max = y_interval[1]
y_bins = create_bins(y_min, y_max, n_bins)
xy_coords = data.apply(lambda row: get_xy(row['Point']), axis=1)
xy_bins_col = xy_coords.apply(lambda xy: (get_bin(x_bins, xy[0]), get_bin(y_bins, xy[1])))
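With each point mapped to a grid cell, a minimal sketch of the count extraction described in the intro; the x_bin, y_bin, and counts names are introduced here for illustration and are not part of the notebook above.
In [ ]:
# sketch: count cases per location bin and category
data["x_bin"] = xy_bins_col.apply(lambda xy: xy[0])
data["y_bin"] = xy_bins_col.apply(lambda xy: xy[1])
counts = data.groupby(["x_bin", "y_bin", "Category"]).size().reset_index(name="count")
counts.head()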