In [1]:
# Load the San Francisco crime data
import pandas as pd
import numpy as np
from sklearn import linear_model
from matplotlib import pyplot as plt
In [2]:
import seaborn as sns
In [4]:
sfdata = pd.read_csv("../data/sf.csv")
In [5]:
len(sfdata.PdDistrict.unique())
Out[5]:
In [6]:
sfdata.PdDistrict.unique()
Out[6]:
In [7]:
# Note: this assignment aliases sfdata rather than copying it, so the new
# column is added to sfdata as well.
test = sfdata
test['CrimeCount'] = np.ones(len(sfdata))
In [8]:
# Aggregate (sum) the numeric columns per police district.
grouped = sfdata.groupby('PdDistrict')
aggregate = grouped.sum()
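In [ ]:
# A quick look at what the aggregation gives us: total crime counts per
# police district, sorted descending (illustrative; relies on the CrimeCount
# column added above).
aggregate['CrimeCount'].sort_values(ascending=False)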
In [11]:
sns.countplot(x="PdDistrict", data=sfdata)
Out[11]:
In [12]:
plt.show()
In [14]:
# Convert the Date and Time columns to actual datetimes. This might take a while!
sfdata['Date'] = pd.to_datetime(sfdata['Date'], errors='raise')
sfdata['Time'] = pd.to_datetime(sfdata['Time'], errors='raise')
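In [ ]:
# Parsing can be sped up further by giving pandas an explicit format string
# instead of letting it infer one per value. A minimal sketch, assuming the
# raw export stores dates as MM/DD/YYYY strings (an assumption about this
# particular CSV, so verify before relying on it):
raw = pd.read_csv("../data/sf.csv")
raw['Date'] = pd.to_datetime(raw['Date'], format='%m/%d/%Y')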
In [ ]:
def buckets(series, n):
    # Takes a series and returns a series mapping each element to one of n
    # buckets, with edges spaced evenly between the series min and max.
    mi, ma = series.min(), series.max()
    edges = np.linspace(mi, ma, n)
    print(edges)
    def f(e):
        for i, el in enumerate(edges):
            if e < el:
                return i
        return n - 1
    return series.map(f)

def cleanColumns(data):
    # Renames the columns in our data frame to their appropriate names.
    # Also drops unnecessary columns.
    data['Latitude'] = data['Y']
    data['Longitude'] = data['X']
    data['Type'] = data['Category']
    print(data.columns)
    data = data.drop(['IncidntNum', 'Category', 'Descript', 'PdDistrict',
                      'Resolution', 'Address', 'X', 'Y', 'Location'], axis=1)
    return data

def createPartitions(data, n):
    # Remove outliers caused by bad latitude/longitude values.
    # We know the city lies between -130 and -120 degrees longitude,
    # and between 37 and 40 degrees latitude.
    data = data[(data.Longitude > -130) & (data.Longitude < -120)]
    data = data[(data.Latitude > 37) & (data.Latitude < 40)]
    # Each row is an occurrence of a single crime.
    # Assign each incident to one of the n*n grid cells (regions).
    data['Region'] = n * buckets(data['Latitude'], n) + buckets(data['Longitude'], n) + 1
    # Map each crime type to a unique integer index.
    mapping = {key: i for i, key in enumerate(data['Type'].unique())}
    data['TypeIndex'] = data['Type'].map(mapping)
    # Now we can add the crime counts.
    data['CrimeCount'] = np.ones(len(data))
    return data

def extractDateFeatures(data):
    # Returns a copy of the data frame with all the date-derived features we're
    # interested in.
    # Map week days to integers.
    DayOfWeek = {'Sunday': 1, 'Monday': 2, 'Tuesday': 3, 'Wednesday': 4,
                 'Thursday': 5, 'Friday': 6, 'Saturday': 7}
    data['DoW'] = data['DayOfWeek'].map(DayOfWeek)
    data = data.drop('DayOfWeek', axis=1)
    print("Created Weeks")
    # We assume that the Date column is already in datetime format.
    data['Month'] = data.Date.map(lambda x: x.month)
    data['DoM'] = data.Date.map(lambda x: x.day)
    data['Year'] = data.Date.map(lambda x: x.year) - data.Date.min().year
    # Time of day, in minutes since the start of the day.
    data['ToD'] = data.Time.map(lambda x: 60 * x.hour + x.minute)
    # Seconds since the earliest sample in the data set.
    data['Time'] = data.Time.map(lambda x: x.value // 10**9) - data.Date.min().value // 10**9
    # Add a column combining month and year into months since the beginning.
    data['TimeFeature'] = 12 * data['Year'] + data['Month']
    data = data.drop('Date', axis=1)
    print("Created the time data features!")
    return data

def extractDataFeatures(data, n):
    # Given the input data read directly from the exported data set
    # (https://data.sfgov.org/Public-Safety/Map-Crime-Incidents-from-1-Jan-2003/gxxq-x39z),
    # convert it into a data frame with the following columns:
    # -> Latitude (float)
    # -> Longitude (float)
    # -> Region (int): which grid region this data point belongs to
    # -> DoW (1-7): Day of the Week
    # -> Month (1-12): Month of the Year
    # -> DoM (1-31): Day of the Month
    # -> Year (0 to max(year) - min(year))
    # -> ToD (int): Time of Day, as minutes since the start of the day
    # -> Time (int): seconds since the earliest sample in the data set
    # -> Type (string): describes the type of crime
    # -> TypeIndex (int): maps each crime type uniquely to a value in [0, #crime types)
    # -> CrimeCount (int): the number of crimes in this area
    # Remove unnecessary columns and rename.
    cData = cleanColumns(data)
    # Create partitions. Note that this modifies the data by adding a Region column.
    partitionedData = createPartitions(cData, n)
    # Now we extract the date features and clean the data.
    finalData = extractDateFeatures(partitionedData)
    return finalData
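In [ ]:
# A tiny sanity check of the bucketing helper on a toy series (the numbers
# below are made up for illustration and are not part of the crime data).
toy = pd.Series([0.0, 2.5, 5.0, 7.5, 10.0])
buckets(toy, 4)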
In [ ]:
def countCrime(d, region):
    '''
    Counts the crime in a given region, returning an array of size 144 with
    hallucinated (zero-filled) entries for month periods with no recorded crime.
    '''
    res = np.zeros(144)
    for i in range(144):
        try:
            res[i] = d.loc[(region, i)].CrimeCount
        except (KeyError, AttributeError, IndexError):
            pass
    return res
In [ ]:
d = train.groupby(['Region', 'TimeFeature']).sum()
In [ ]:
d.loc[1]
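In [ ]:
# An equivalent, vectorized way to zero-fill missing (Region, TimeFeature)
# pairs, rather than looking them up one at a time as countCrime does:
# reindex the grouped frame against the full cross product. A sketch that
# assumes a 10x10 grid (100 regions) and 144 month buckets as above.
full = pd.MultiIndex.from_product([range(1, 101), range(144)],
                                  names=['Region', 'TimeFeature'])
d.reindex(full, fill_value=0)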
In [ ]:
def splitTrainTest(D, year=None):
    '''
    Given a data frame with the specified data, split it into a training set
    and a test set. The test set consists of a single held-out year
    (the last year by default).
    '''
    # We only want to keep some of the columns.
    D = D.loc[:, ['Year', 'TimeFeature', 'CrimeCount', 'Region']]
    # Hold out the requested year (or, if none is given, the last year).
    if year is None:
        test = D[D['Year'] == D['Year'].max()]
        train = D[D['Year'] != D['Year'].max()]
    else:
        test = D[D['Year'] == year]
        train = D[D['Year'] != year]
    # Year has served its purpose; keep Region, TimeFeature, and CrimeCount.
    train = train.drop('Year', axis=1)
    test = test.drop('Year', axis=1)
    print("Finished Creating Testing Set")
    # Group by (Region, TimeFeature) and sum the crime counts per region-month.
    trainD = train.groupby(['Region', 'TimeFeature']).sum()
    testD = test.groupby(['Region', 'TimeFeature']).sum()
    trainRes = {}
    testRes = {}
    for region in range(1, int(D.Region.max()) + 1):
        # Training: column 0 is the month index, column 1 is the crime count.
        trainRes[region] = np.zeros((144, 2))
        trainRes[region][:, 0] = range(144)
        trainRes[region][:, 1] = countCrime(trainD, region)
        # Test
        testRes[region] = np.zeros((144, 2))
        testRes[region][:, 0] = range(144)
        testRes[region][:, 1] = countCrime(testD, region)
    return trainRes, testRes
In [ ]:
results = extractDataFeatures(sfdata, 10)
In [ ]:
def plotHistogram(results, n):
    fig, ax = plt.subplots(nrows=1, ncols=1)
    ax.hist(results.Region, bins=range(n * n + 1))
    plt.xlabel("San Francisco Region")
    plt.ylabel("Total Reported Crime Incidents")
    plt.title("Crime Incidents in San Francisco when Divided into a {}x{} Grid".format(n, n))
    plt.savefig("figures/sf_n{}".format(n))
    plt.close(fig)
In [ ]:
plotHistogram(results, 10)
In [ ]:
train, test = splitTrainTest(results)
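In [ ]:
# Shape check (illustrative): each region should map to a (144, 2) array of
# (month index, crime count) rows.
train[1].shape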
In [ ]:
def getRMSE(x, y):
    return np.sqrt(np.sum((y - x)**2) / len(y))

def trainAndTest(train, test):
    # Given training and testing data, train a linear regression model per
    # region, test it, and compute the RMSE of each region. Returns a
    # dictionary of trained models, a dictionary of RMSEs by region, and a
    # dictionary of predictions.
    models = {}
    RMSE = {}
    predictions = {}
    for region in train:
        if region % 10 == 0:
            print("Training on region {}".format(region))
        model = linear_model.LinearRegression()
        x = train[region][:, 0].reshape((-1, 1))
        y = train[region][:, 1].reshape((-1, 1))
        model.fit(x, y)
        xTest = test[region][:, 0].reshape((-1, 1))
        yTest = test[region][:, 1].reshape((-1, 1))
        preds = model.predict(xTest)
        rmse = getRMSE(yTest, preds)
        # Store the results.
        models[region] = model
        RMSE[region] = rmse
        predictions[region] = preds
    return models, RMSE, predictions
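In [ ]:
# Sanity check that getRMSE matches the standard definition via scikit-learn's
# mean_squared_error (toy numbers, not part of the analysis).
from sklearn.metrics import mean_squared_error
a = np.array([1.0, 2.0, 3.0])
b = np.array([1.5, 2.0, 2.5])
getRMSE(a, b), np.sqrt(mean_squared_error(b, a))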
In [ ]:
models, RMSE, predictions = trainAndTest(train, test)
In [ ]:
# Average RMSE across regions.
sum(RMSE.values()) / len(RMSE)
In [ ]:
tRes = createPartitions(results, 1)
In [ ]:
tRes
In [ ]:
test
In [ ]:
# We try different values of n and calculate the average RMSE for each value.
# Note that `results` already has the date features extracted.
rmses = []
max_rmses = []
min_rmses = []
for n in list(range(1, 10)) + list(range(10, 50, 10)):
    # We only need to extract the date features once, which `results` already
    # has (for n = 10); re-partitioning overwrites the Region column for the new n.
    tRes = createPartitions(results, n)
    # This saves a plot of the distribution as a histogram.
    plotHistogram(tRes, n)
    # Now we split into train and test.
    train, test = splitTrainTest(tRes)
    # Now we calculate the average RMSE.
    _, RMSE, _ = trainAndTest(train, test)
    print("RMSE: {}".format(sum(RMSE.values()) / len(RMSE)))
    rmses.append(sum(RMSE.values()) / len(RMSE))
    max_rmses.append(max(RMSE.values()))
    min_rmses.append(min(RMSE.values()))
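In [ ]:
# Summarize the sweep as (n, average RMSE, max RMSE, min RMSE) tuples so the
# trend is easy to read off.
ns = list(range(1, 10)) + list(range(10, 50, 10))
list(zip(ns, rmses, max_rmses, min_rmses))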
In [ ]:
min_rmses
In [ ]:
x = list(range(1, 10)) + list(range(10, 50, 10))
plt.scatter(x, rmses, color='red')
plt.scatter(x, max_rmses, color='blue')
plt.title("RMSE vs. Geographic Resolution")
plt.xlabel("Geographic Resolution (n)")
plt.ylabel("Average (red) and Max (blue) RMSE")
plt.show()
In [ ]:
list(zip(x, rmses))
In [ ]: