In [1]:
# Imports for data manipulation
import pandas as pd
import numpy as np
import pickle
In [2]:
# Imports for data visualization
from matplotlib import pyplot as plt
import seaborn as sns
In [3]:
def plotCrimeCounts(data, features, city):
    '''
    data: the dataframe containing the crime statistics.
    features: a dictionary mapping {description: column name}.
    city: the name of the city for which we are plotting.
    '''
    for description, columnName in features.items():
        sns.countplot(y=columnName, data=data)
        plt.title('Crimes in {} by {}'.format(city, description))
        plt.show()
In [35]:
# Helper functions to clean the SF data. We only run these once, since
# we save the results using pickle!
def bucket(series, n):
    # Takes a series and returns an array mapping each element to
    # one of n equal-width buckets, indexed 0 to n-1.
    mi, ma = series.min(), series.max()
    buckets = np.linspace(mi, ma, n + 1)
    res = np.zeros(len(series))
    array = series.values
    if np.isnan(array).any():
        print("Error! A total of {} NaNs!".format(np.isnan(array).sum()))
    for i in range(n):
        res[(buckets[i] <= array) & (array < buckets[i+1])] = i
    # The half-open intervals above exclude the maximum value, so place
    # it in the top bucket explicitly.
    res[array == ma] = n - 1
    return res
def cleanColumns(data):
    # Renames the columns in our data frame to their appropriate names
    # and drops unnecessary columns.
    data['Latitude'] = data['Y']
    data['Longitude'] = data['X']
    data['Type'] = data['Category']
    data = data.drop(['IncidntNum', 'Descript', 'Resolution', 'Address', 'X', 'Y', 'Location'], axis=1)
    return data
def createPartitions(data, n):
    # Remove outliers caused by latitude/longitude issues.
    # We know the city lies between -130 and -120 degrees longitude
    # and between 37 and 40 degrees latitude.
    data = data[(data.Longitude > -130) & (data.Longitude < -120)]
    data = data[(data.Latitude > 37) & (data.Latitude < 40)].copy()
    # Each row is an occurrence of a single crime.
    data['xRegion'] = bucket(data['Latitude'], n)
    data['yRegion'] = bucket(data['Longitude'], n)
    data['Region'] = n * data.xRegion + data.yRegion
    # Add the crime types to the results as integer codes.
    mapping = {key: i for i, key in enumerate(data['Type'].unique())}
    data['TypeIndex'] = data['Type'].map(mapping)
    # Now we can add the crime counts.
    data['CrimeCount'] = np.ones(len(data))
    return data
def extractDateFeatures(data):
    # Adds the date features we're interested in and returns the frame
    # with the raw date columns dropped.
    # Map week days to integers.
    DayOfWeek = {'Sunday': 1,
                 'Monday': 2,
                 'Tuesday': 3,
                 'Wednesday': 4,
                 'Thursday': 5,
                 'Friday': 6,
                 'Saturday': 7}
    data['DoW'] = data['DayOfWeek'].map(DayOfWeek)
    data = data.drop('DayOfWeek', axis=1)
    print("Created weekday feature")
    # We assume that the Date column is already in datetime format.
    data['Month'] = data.Date.map(lambda x: x.month)
    data['DoM'] = data.Date.map(lambda x: x.day)
    data['Year'] = data.Date.map(lambda x: x.year)
    data['ToD'] = data.Time.map(lambda x: x.hour)  # hour of day
    # Seconds since the earliest date in the data.
    data['Time'] = data.Time.map(lambda x: x.value // 10**9) - data.Date.min().value // 10**9
    # Add a column combining month and year into months since the beginning.
    min_year = data.Year.min()
    data['TimeFeature'] = 12 * (data.Year - min_year) + data.Month
    data = data.drop('Date', axis=1)
    print("Created the time data features!")
    return data
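In [ ]:
# Aside (a sketch, not part of the original pipeline): bucket() can be
# written without the explicit loop using np.digitize. This version
# should agree with the loop-based one above, including placing the
# maximum value in the top bucket.
def bucketFast(series, n):
    edges = np.linspace(series.min(), series.max(), n + 1)
    # digitize returns 1-based bin indices; shift and clip to 0..n-1.
    return np.clip(np.digitize(series.values, edges) - 1, 0, n - 1).astype(float)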
In [26]:
# Make histograms of the crimes by month for each given year
def histograms(data, city, directory):
years = data.Year.unique()
plt.clf()
for year in years:
# Subset the data
tmp = data[data.Year == year]
# Make Histogram
sns.countplot(x = 'Month', data=tmp)
plt.title('{} Crime Histogram for {}'.format(city, year))
plt.savefig('../figures/{}/crimes_monthly_{}'.format(
directory, year))
plt.close()
In [6]:
# Let's do one really large heatmap
def createHeatmapData(data, size = 30):
    # data is a partitioned data frame (with xRegion/yRegion columns).
    # We build a size x size matrix counting the number of crimes
    # in each grid cell.
res = np.zeros((size, size))
for i in range(size):
for j in range(size):
res[i,j] = len(data[(data.xRegion == i) & (data.yRegion == j)])
return res
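In [ ]:
# Aside (a sketch, assuming xRegion/yRegion are the bucket indices
# produced above): the double loop in createHeatmapData scans the frame
# size*size times; a single cross-tabulation gives the same counts in
# one pass.
def createHeatmapDataFast(data, size=30):
    counts = pd.crosstab(data.xRegion.astype(int), data.yRegion.astype(int))
    return counts.reindex(index=range(size), columns=range(size), fill_value=0).values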
In [27]:
# Now redo the heatmap year by year.
def yearByYear(data, folder, city):
# Find unique years!
years = data.Year.unique()
plt.clf() # Clear figure before plotting.
for year in years:
        # Subset the data for this year.
tmp = data[data.Year == year]
# Only plot if there's data:
if len(tmp) > 0:
sns.heatmap(createHeatmapData(tmp))
plt.title('Crime Distribution in {} for {}'.format(city, year))
plt.savefig('../figures/{}/heat_map_{}'.format(folder, year))
plt.close()
In [28]:
# Now month by month, since monthly counts are what we want to analyze.
def monthByMonth(data, folder, city):
years = data.Year.unique()
months = data.Month.unique()
plt.clf() # Clear figure before plotting
for year in years:
for month in months:
tmp = data[(data.Year == year) & (data.Month == month)]
# Only plot if there is data to be plotted
if len(tmp) > 0:
sns.heatmap(createHeatmapData(tmp))
plt.title('Crime Distribution in {} for {}, {}'.format(
city, month, year))
plt.savefig('../figures/{}/heat_map_{}_{}'.format(
folder, month, year))
plt.close()
In [34]:
# Now we write a function to partition the data
def createSimplePartitions(data, n):
    # Returns a partitioned version of the cleaned data.
data['xRegion'] = bucket(data.Latitude, n)
data['yRegion'] = bucket(data.Longitude, n)
data['Region'] = n * data.xRegion + data.yRegion
return data
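In [ ]:
# Caveat (an aside, not from the original analysis): unlike
# createPartitions, createSimplePartitions does no outlier filtering,
# and bucket() spans min..max, so a single stray coordinate stretches
# the whole grid. A quantile guard like this sketch could help; the
# 0.1%/99.9% cutoffs are an assumption, not a tuned choice.
def filterCoordinateOutliers(data, lo=0.001, hi=0.999):
    lat_lo, lat_hi = data.Latitude.quantile([lo, hi])
    lon_lo, lon_hi = data.Longitude.quantile([lo, hi])
    return data[data.Latitude.between(lat_lo, lat_hi) &
                data.Longitude.between(lon_lo, lon_hi)].copy()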
In [10]:
sfdata_file = '../data/sf.csv'
sfdata = pd.read_csv(sfdata_file)
In [11]:
# Create histograms on some important metrics
# Map from english_description : column name
columnsToPlotSF = { 'Police District' : 'PdDistrict',
'Crime Type' : 'Category',
'Day of the Week' : 'DayOfWeek',
'Crime Outcome' : 'Resolution' }
In [12]:
plotCrimeCounts(sfdata, columnsToPlotSF, 'San Francisco')
In [13]:
# Read bosdata
import re
import warnings
bos_file = '../data/boston.csv'
target_type = str # The desired output type
with warnings.catch_warnings(record=True) as ws:
warnings.simplefilter("always")
bosData = pd.read_csv(bos_file, sep=",", header=0)
print("Warnings raised:", ws)
    # The warning flags columns with mixed types; cast them to strings.
for w in ws:
s = str(w.message)
print("Warning message:", s)
match = re.search(r"Columns \(([0-9,]+)\) have mixed types\.", s)
if match:
columns = match.group(1).split(',') # Get columns as a list
columns = [int(c) for c in columns]
print("Applying %s dtype to columns:" % target_type, columns)
bosData.iloc[:,columns] = bosData.iloc[:,columns].astype(target_type)
In [14]:
# Create histograms on some important metrics for Boston
columnsToPlotBos = { 'Reporting District' : 'REPTDISTRICT',
'Weapon Type' : 'WEAPONTYPE',
'Shooting/No Shooting' : 'Shooting',
'Officer Shift' : 'SHIFT',
'Year' : 'Year',
'Month' : 'Month',
'Day of the Week' : 'DAY_WEEK'}
In [15]:
plotCrimeCounts(bosData, columnsToPlotBos, 'Boston')
In [16]:
loadPickle = True
In [17]:
# Convert date to actual date format. This might take a while!
# Note that we should not have to do this once the data
# has been cleaned!
# SKIP IF CLEAN DATA EXISTS
if not loadPickle:
    sfdata['Date'] = pd.to_datetime(sfdata['Date'], errors='raise')
    sfdata['Time'] = pd.to_datetime(sfdata['Time'], errors='raise')
In [18]:
sfpickle_file = '../../cs281_data/large_data/sfclean.pk'
In [19]:
# We only need to do this once, afterwards we should load from the
# saved location.
# DO NOT RUN IF PICKLED FILE ALREADY EXISTS
if not loadPickle:
sfclean = cleanColumns(sfdata)
sfclean = extractDateFeatures(sfclean)
sfclean.to_pickle(sfpickle_file)
In [20]:
# Load sfclean from the pickled location.
sfclean = pd.read_pickle(sfpickle_file)
In [21]:
# Sort the data by time
sfclean = sfclean.sort_values(by='TimeFeature')
In [22]:
# Save memory by deleting old data
del(sfdata)
# del(sfclean_data)
In [23]:
# Generate some additional histograms
columnsToPlotSF2 = { 'Month' : 'Month',
'Day of Month' : 'DoM',
'Year' : 'Year',
'Hour of Day' : 'ToD' }
In [24]:
plotCrimeCounts(sfclean, columnsToPlotSF2, 'San Francisco')
In [29]:
histograms(sfclean, 'San Francisco', 'sf_data_analysis')
In [36]:
# Let's generate some heatmaps for these crime data sets.
# Can we overlay these on the geographical map?
n = 30
SFByRegion = createPartitions(sfclean, n)
In [39]:
sns.heatmap(createHeatmapData(SFByRegion))
plt.title('Crime Distribution in San Francisco')
plt.show()
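In [ ]:
# Toward the overlay question above (a sketch with assumptions: we only
# set the geographic extent here; drawing an actual street map
# underneath would still need a map image or a mapping library).
counts = createHeatmapData(SFByRegion)
plt.imshow(counts, origin='lower', aspect='auto', alpha=0.6,
           extent=[SFByRegion.Longitude.min(), SFByRegion.Longitude.max(),
                   SFByRegion.Latitude.min(), SFByRegion.Latitude.max()])
plt.title('Crime counts on geographic coordinates')
plt.show()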
In [ ]:
yearByYear(SFByRegion, 'sf_data_analysis', 'San Francisco')
In [ ]:
monthByMonth(SFByRegion, 'sf_data_analysis', 'San Francisco')
In [40]:
# We now process the Boston Data: Again, we pickle the results.
loadBosPickle = True
bos_pickle_file = '../../cs281_data/large_data/bosclean.pk'
In [41]:
# Let's process the boston data
if not loadBosPickle:
    # Clean the columns. Note: this maps X to Latitude and Y to
    # Longitude, the opposite of the SF mapping above; we keep the
    # original assignment, but it is worth verifying against the raw file.
    bosData['Latitude'] = bosData['X']
    bosData['Longitude'] = bosData['Y']
# Drop unused columns
toDrop = ['X', 'Y']
bosData = bosData.drop(toDrop, axis=1)
# Extract date features
# day of week
day = np.array(bosData.DAY_WEEK)
day[ day == "Sunday"] = 0
day[ day == "Monday"] = 1
day[ day == "Tuesday"] = 2
day[ day == "Wednesday"] = 3
day[ day == "Thursday"] = 4
day[ day == "Friday"] = 5
day[ day == "Saturday"] = 6
    date_time = np.array([x.split() for x in bosData.FROMDATE])
    date = date_time[:, 0]
    time = date_time[:, 1]
    ampm = date_time[:, 2]  # 'AM'/'PM' marker
    # month, day, year
    date = np.array([x.split('/') for x in date])
    month = [int(x) for x in date[:, 0]]
    dom = [int(x) for x in date[:, 1]]
    year = [int(x) for x in date[:, 2]]
    min_year = np.min(year)
    min_month = np.min(month)
    time_feat = np.subtract(year, min_year) * 12 + np.subtract(month, min_month)
    # Time of day in minutes since midnight, converted from the 12-hour clock.
    def toMinutes(hm, marker):
        hour, minute = int(hm[0]) % 12, int(hm[1])  # '12' maps to hour 0 ...
        if marker == 'PM':
            hour += 12  # ... and 'PM' adds 12, so 12 PM -> 720 minutes
        return 60 * hour + minute
    time_c = [x.split(':') for x in time]
    time = [toMinutes(x, y) for x, y in zip(time_c, ampm)]
# Add them back to the data frame
bosData['Day'] = day
del day
bosData['Month'] = month
del month
bosData['Dom'] = dom
del dom
bosData['Year'] = year
del year
bosData['Time'] = time
del time
bosData['TimeFeature'] = time_feat
del time_feat
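In [ ]:
# Sanity check (a sketch; only meaningful when the cell above actually
# ran). pd.to_datetime can parse FROMDATE directly; the format string
# is an assumption inferred from the splitting logic above.
if not loadBosPickle:
    ts = pd.to_datetime(bosData.FROMDATE, format='%m/%d/%Y %I:%M:%S %p')
    # Should match the hand-rolled minutes-since-midnight computation.
    assert (60 * ts.dt.hour + ts.dt.minute == bosData.Time).all()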
In [42]:
# Drop some more unnecessary columns
if not loadBosPickle:
    toDrop = ['COMPNOS', 'NatureCode', 'Location', 'XSTREETNAME', 'STREETNAME']
    toDrop += ['INCIDENT_TYPE_DESCRIPTION', 'MAIN_CRIMECODE', 'REPTDISTRICT']
    toDrop += ['REPORTINGAREA', 'FROMDATE', 'WEAPONTYPE', 'Shooting', 'DOMESTIC']
    bosData = bosData.drop(toDrop, axis=1)
    print("Records before: {}".format(len(bosData)))
    bosData = bosData.dropna(axis=0)
    print("Records after: {}".format(len(bosData)))
In [43]:
# Let's save the data
if not loadBosPickle:
bosData.to_pickle(bos_pickle_file)
del(bosData)
In [44]:
# Let's load the data
bosclean = pd.read_pickle(bos_pickle_file)
In [45]:
histograms(bosclean, 'Boston', 'boston_data_analysis')
In [46]:
# Let's generate some heatmaps of the Boston crime data.
# Can we overlay these on a Boston map?
n = 30
regionedData = createSimplePartitions(bosclean, n)
In [47]:
# Let's do a single large heatmap
sns.heatmap(createHeatmapData(regionedData, n))
plt.title('Crime Distribution in Boston')
plt.show()
In [ ]:
yearByYear(regionedData, 'boston_data_analysis', 'Boston')
In [ ]:
monthByMonth(regionedData, 'boston_data_analysis', 'Boston')
In [48]:
# Read in the Chicago data set
import re
import warnings
years = np.arange(2001, 2016)
target_type = str # The desired output type
chicagoData = pd.DataFrame()
for year in years:
data_file = '../../cs281_data/large_data/chicago/%d.0.csv' % year
with warnings.catch_warnings(record=True) as ws:
warnings.simplefilter("always")
data = pd.read_csv(data_file, sep=",", header=None)
print("Warnings raised:", ws)
        # The warning flags columns with mixed types; cast them to strings.
for w in ws:
s = str(w.message)
print("Warning message:", s)
match = re.search(r"Columns \(([0-9,]+)\) have mixed types\.", s)
if match:
columns = match.group(1).split(',') # Get columns as a list
columns = [int(c) for c in columns]
print("Applying %s dtype to columns:" % target_type, columns)
data.iloc[:,columns] = data.iloc[:,columns].astype(target_type)
    chicagoData = pd.concat([chicagoData, data], ignore_index=True)
print(len(chicagoData))
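In [ ]:
# Aside (a sketch): the warning-matching dance above can be avoided by
# fixing the dtype up front, at the cost of loading every column as a
# string. Shown on a few rows of the last file read; data_file still
# holds whatever path the loop above ended on.
pd.read_csv(data_file, sep=",", header=None, dtype=str, nrows=5)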
In [49]:
# Extract Features
chicagoData.columns = ['Date', 'Latitude', 'Longitude']
pickledChicago = True
chicago_pickle_file = '../../cs281_data/large_data/chicago.pkl'
if not pickledChicago:
date_time = np.array([x.split() for x in chicagoData.Date])
date = date_time[:,0]
time = date_time[:,1]
tod = date_time[:,2]
# month, day, year
date = np.array([x.split('/') for x in date])
month = [int(x) for x in date[:,0]]
dom = [int(x) for x in date[:,1]]
year = [int(x) for x in date[:,2]]
min_year = np.min(year)
min_month = np.min(month)
time_feat = np.subtract(year, min_year)*12 + np.subtract(month,min_month)
    # Time of day in minutes since midnight, converted from the 12-hour
    # clock (same logic as the Boston cell above).
    def toMinutesChi(hm, marker):
        hour, minute = int(hm[0]) % 12, int(hm[1])
        if marker == 'PM':
            hour += 12
        return 60 * hour + minute
    time_c = [x.split(':') for x in time]
    time = [toMinutesChi(x, y) for x, y in zip(time_c, tod)]
chicagoData['Month'] = month
del month
chicagoData['DoM'] = dom
del dom
chicagoData['Year'] = year
del year
    chicagoData['ToD'] = [m // 60 for m in time]  # hour of day (0-23); the raw tod marker is only 'AM'/'PM'
    del tod
chicagoData['Time'] = time
del time
chicagoData['TimeFeature'] = time_feat
del time_feat
# Sort the results
chicagoData = chicagoData.sort_values(by='TimeFeature')
chicagoData = chicagoData.dropna()
# Save the results to a pickle file
chicagoData.to_pickle(chicago_pickle_file)
print "Saved to Pickle File"
In [75]:
# Let's load the data
%time chicagoClean = pd.read_pickle(chicago_pickle_file)
del chicagoData
In [51]:
# Create heatmaps and histograms if possible
columnsToPlotChicago = { 'Year' : 'Year',
'Month' : 'Month',
'Day of Month' : 'DoM',
'Hour of Day' : 'ToD'}
plotCrimeCounts(chicagoClean, columnsToPlotChicago, 'Chicago')
In [52]:
# Make histograms of the crime
histograms(chicagoClean, 'Chicago', 'chicago_data_analysis')
In [92]:
# Partition into n = 30
n = 30
%time chicagoPartitions = createSimplePartitions(chicagoClean, n)
In [93]:
# Heatmap of the entire Chicago region!
%time sns.heatmap(createHeatmapData(chicagoPartitions, n))
plt.title('Crime Distribution in Chicago')
plt.show()
In [94]:
%%time
yearByYear(chicagoPartitions, 'chicago_data_analysis', 'Chicago')
In [95]:
%%time
monthByMonth(chicagoPartitions, 'chicago_data_analysis', 'Chicago')
In [ ]:
'''
THE BELOW HAS BEEN MOVED TO BASELINES.ipython
'''
# Calculate averages for each region based on the training set
# Last year
last_year = sfclean.Year.max()
training = sfclean[sfclean.Year != last_year]
test = sfclean[sfclean.Year == last_year]
# Partition the data
averageTrainingData = createPartitions(training, 10)
In [ ]:
years = averageTrainingData.Year.unique()
months = averageTrainingData.Month.unique()
regions = averageTrainingData.Region.unique()
# Calculate how many months of data we have
nmonths = 0
for year in years:
nmonths += len(averageTrainingData[averageTrainingData.Year == year].Month.unique())
In [ ]:
nmonths
In [ ]:
# Now calculate the predictions!
predictions = {}
for region in regions:
    data = averageTrainingData[averageTrainingData.Region == region]
crimes = len(data)
predictions[region] = crimes / float(nmonths)
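In [ ]:
# The same baseline in one line via groupby (a sketch; should agree
# with the loop above up to floating point):
predictionsSeries = averageTrainingData.groupby('Region').size() / float(nmonths)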
In [ ]:
# Now calculate the rmse
testAverage = createPartitions(test, 10)
In [ ]:
# Let's create a dictionary with the counts for region,month,year
from collections import defaultdict
crimeCounts = defaultdict(int)
for i, row in testAverage.iterrows():
crimeCounts[(row.Region, row.Year, row.Month)] += 1
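In [ ]:
# The same counts via groupby (a sketch; the Series index plays the
# role of the (Region, Year, Month) keys above):
crimeCountsSeries = testAverage.groupby(['Region', 'Year', 'Month']).size()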
In [ ]:
# Now calculate the RMSE of this function:
error = []
for region in testAverage.Region.unique():
data = testAverage[testAverage.Region == region]
for year in data.Year.unique():
data2 = data[data.Year == year]
for month in data2.Month.unique():
            try:
                error.append(crimeCounts[(region, year, month)] -
                             predictions[region])
            except KeyError:
                # Region present in the test set but never seen in training.
                print(region)
error = np.array(error)
rmse = np.sqrt(np.sum(error ** 2) / len(error))
In [ ]:
rmse
In [ ]:
np.std(list(crimeCounts.values()))
In [ ]:
np.max(list(crimeCounts.values()))
In [ ]: