In [1]:
import pandas as pd
import numpy as np
import pprint
import requests
In [2]:
# Load the SF crime train/test sets (paths are relative to this notebook's folder).
train_data = pd.read_csv("../../data/raw/train.csv")
# Parse the 'Dates' column into pandas Timestamps so datetime attributes
# (minute, hour, day, ...) are available below.
train_data['Dates'] = pd.to_datetime(train_data['Dates'])
test_data = pd.read_csv("../../data/raw/test.csv")
test_data['Dates'] = pd.to_datetime(test_data['Dates'])
In [3]:
len(train_data)
Out[3]:
In [4]:
train_data.head()
Out[4]:
In [5]:
crimes = train_data['Category'].unique()
pprint.pprint("Crimes: {}, #{}".format(crimes, len(crimes)), indent=2)
In [6]:
train_data['Category'].value_counts()
Out[6]:
In [7]:
def get_halfhour(minute):
    """Return 0 for the first half of an hour (minute < 30), else 1."""
    return 0 if minute < 30 else 1
def get_daynight(hour):
    """Return 0 for daytime hours (6..22 inclusive), 1 for night."""
    return 0 if 5 < hour < 23 else 1
def generate_time_features(times):
    """Derive calendar/time-of-day features from a sequence of datetimes.

    Parameters
    ----------
    times : iterable of datetime-like values
        Typically the parsed 'Dates' column of the crime data.

    Returns
    -------
    pd.DataFrame
        Columns: minute, halfhour (0 = first half hour, 1 = second),
        hour, day_night (0 = daytime 6-22h, 1 = night), day, month, year.
        Indexed 0..n-1 regardless of the input's own index.
    """
    # Materialize with a fresh RangeIndex so the result is positionally
    # aligned, matching the original element-wise construction even when
    # `times` carries a non-default index.
    ts = pd.Series(list(times))
    # Vectorized .dt accessors replace seven per-element Python loops.
    minute = ts.dt.minute.astype(int)
    hour = ts.dt.hour.astype(int)
    return pd.DataFrame({
        'minute': minute,
        'halfhour': (minute >= 30).astype(int),                   # same rule as get_halfhour
        'hour': hour,
        'day_night': ((hour <= 5) | (hour >= 23)).astype(int),    # same rule as get_daynight
        'day': ts.dt.day.astype(int),
        'month': ts.dt.month.astype(int),
        'year': ts.dt.year.astype(int),
    })
In [8]:
times = train_data["Dates"]
In [9]:
time_features = generate_time_features(times)
print("success")
In [10]:
print(time_features)
In [11]:
# outliers are all at position X = -120.5, Y = 90
def filter_x(x):
    """Replace outlier longitudes (the bogus X = -120.5 points) with a value inside SF."""
    if x <= -122:
        return x
    return -122.4483364
def filter_y(y):
    """Replace outlier latitudes (the bogus Y = 90 points) with a value inside SF."""
    return y if y <= 37.9 else 37.7563690
In [13]:
# Take a look at the positions of our train data (after outlier filtering).
min_x_train = min(map(filter_x, train_data["X"]))
max_x_train = max(map(filter_x, train_data["X"]))
min_y_train = min(map(filter_y, train_data["Y"]))
max_y_train = max(map(filter_y, train_data["Y"]))
print("Min_X_train: ", min_x_train)
print("Max_X_train: ", max_x_train)
print("Min_Y_train: ", min_y_train)
print("Max_Y_train: ", max_y_train)
In [14]:
# Take a look at the positions of our test data (after outlier filtering).
min_x_test = min(map(filter_x, test_data["X"]))
max_x_test = max(map(filter_x, test_data["X"]))
min_y_test = min(map(filter_y, test_data["Y"]))
max_y_test = max(map(filter_y, test_data["Y"]))
print("Min_X_test: ", min_x_test)
print("Max_X_test: ", max_x_test)
print("Min_Y_test: ", min_y_test)
print("Max_Y_test: ", max_y_test)
In [15]:
# Final coordinates for grid that covers San Francisco.
# Bounds chosen to enclose the filtered train/test positions printed above.
min_x = -122.53  # western edge (longitude)
max_x = -122.35  # eastern edge (longitude)
min_y = 37.65    # southern edge (latitude)
max_y = 37.84    # northern edge (latitude)
dif_x = max_x - min_x  # grid width in degrees
dif_y = max_y - min_y  # grid height in degrees
In [16]:
# grid functions
def get_subregion_pos(subregion_id, min_x, min_y, dif_x, dif_y, x_sections, y_sections):
    """Return the (x, y) center coordinates of a grid cell.

    Cell ids are assigned row-major: id = col + x_sections * row,
    inverse of get_subregion.
    """
    col = subregion_id % x_sections
    row = subregion_id // x_sections
    x_pos = ((col + 0.5) / x_sections) * dif_x + min_x
    y_pos = ((row + 0.5) / y_sections) * dif_y + min_y
    return (x_pos, y_pos)
def get_subregion(pos_x, pos_y, min_x, min_y, dif_x, dif_y, x_sections, y_sections):
    """Map a coordinate inside the grid to its row-major cell id."""
    x_sec = int(x_sections * (pos_x - min_x) / dif_x)
    y_sec = int(y_sections * (pos_y - min_y) / dif_y)
    return x_sec + x_sections * y_sec
def get_subregion_series(data, min_x, min_y, dif_x, dif_y, x_sections=20, y_sections=20):
    """Compute the grid-cell id for every row of `data`.

    Parameters
    ----------
    data : pd.DataFrame
        Must contain "X" (longitude) and "Y" (latitude) columns.
    min_x, min_y, dif_x, dif_y : float
        Grid origin and extent, as defined in the constants cell.
    x_sections, y_sections : int, optional
        Grid resolution; defaults to the original hard-coded 20x20.

    Returns
    -------
    pd.Series named 'subregion', one cell id per row, indexed 0..n-1.
    """
    # Iterate the columns directly instead of data["X"][i]: label-based
    # indexing silently breaks (KeyError / wrong rows) when `data` does not
    # have a default RangeIndex, e.g. after filtering rows.
    subregions = [
        get_subregion(x, y, min_x, min_y, dif_x, dif_y, x_sections, y_sections)
        for x, y in zip(data["X"], data["Y"])
    ]
    return pd.Series(subregions, name='subregion')
In [17]:
subregion_series = get_subregion_series(train_data, min_x, min_y, dif_x, dif_y)
In [19]:
# look at the number of crimes in each subregion
subregion_series.value_counts()
Out[19]:
In [21]:
# highest crime rate around union square
get_subregion_pos(293, min_x, min_y, dif_x, dif_y, 20, 20)
Out[21]:
In [22]:
# generate one-hot encoding of police districts
one_hot_police_destricts = pd.get_dummies(train_data["PdDistrict"])
In [23]:
one_hot_police_destricts["NORTHERN"]
Out[23]:
In [49]:
regions = subregion_series.unique()
crimes = train_data['Category'].unique()
In [50]:
# count crimes in each region
# criminal_activity_local[r] maps each crime category to its count in region r,
# plus a special "N" key holding the region's total crime count.
criminal_activity_local = {}
criminal_activity_overall = train_data["Category"].value_counts()
# Initialize every (region, category) counter to zero so the accumulation
# loop below can increment unconditionally.
for r in regions:
    criminal_activity_local[r] = {}
    criminal_activity_local[r]["N"] = 0
    for c in crimes:
        criminal_activity_local[r][c] = 0
# subregion_series is positionally aligned with train_data's rows, so the
# enumerate index i addresses the matching Category entry.
for i, r in enumerate(subregion_series):
    criminal_activity_local[r][train_data["Category"][i]] += 1
    criminal_activity_local[r]["N"] += 1
In [51]:
# union square
criminal_activity_local[293]
Out[51]:
In [52]:
# global crime distribution: fraction of all crimes that each category represents
distribution_global = {
    c: criminal_activity_overall[c] / len(train_data) for c in crimes
}
for c in distribution_global:
    print(c, distribution_global[c])
In [65]:
# local crime distribution
# Regions with fewer than `sufficient_n` recorded crimes fall back to the
# global distribution to avoid noisy estimates from tiny samples.
distribution_local = {}
sufficient_n = 500
for r in regions:
    # The sample-size check depends only on the region, so hoist it out of
    # the per-category loop instead of re-evaluating it for every crime.
    n_local = criminal_activity_local[r]["N"]
    if n_local >= sufficient_n:
        distribution_local[r] = {
            c: criminal_activity_local[r][c] / n_local for c in crimes
        }
    else:
        distribution_local[r] = {c: distribution_global[c] for c in crimes}
In [72]:
# crime distribution at union square
print(distribution_local[293])
In [70]:
# Sanity check: a probability distribution must sum to (approximately) 1.
# Original line was a SyntaxError (unclosed paren) and summed the dict's
# keys (category name strings) rather than the probabilities.
sum(distribution_local[293].values())
Out[70]:
In [ ]: