In [1]:
import dask.dataframe as dd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import seaborn as sns
In [2]:
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'
# set neat seaborn whitegrid styles for matplotlib charts
plt.style.use('seaborn')
sns.set_style('whitegrid')
In [3]:
%%time
# set parquet data folder path
parquet_data_folder = '../data/crimes-2001-to-present.snappy.parq'
print('Loading crime data from: {}'.format(parquet_data_folder))
# load crimes parquet data into dask df
crimes = dd.read_parquet(parquet_data_folder, index='Date')
# load all data into memory
crimes = crimes.persist()
print('Crime data loaded into memory.')
In [4]:
%%time
# log records count and data frame/partitions stats
print('Chicago Crime data stats:')
print('---------------------------------------')
print('{:,} total records in {} partitions'.format(len(crimes), crimes.npartitions))
print('DataFrame size: {:,}'.format(crimes.size.compute()))
In [5]:
# check data frame structure
# (bare expression: the notebook shows the dask data frame summary as this cell's output)
crimes
Out[5]:
In [6]:
%%time
# preview crimes data with Date index
# (shows the first 5 records; %%time reports how long the preview takes)
crimes.head(5)
Out[6]:
In [7]:
%%time
# get crime types
crime_types = crimes[['PrimaryType']]
# get crime counts by primary type
crime_type_total = crime_types.groupby('PrimaryType').size().compute()
# get arrests and domestic counts for each primary crime type
crime_type_counts = crimes[['PrimaryType', 'Arrest', 'Domestic']]\
.groupby('PrimaryType').sum().compute()
# add crime type totals column
crime_type_counts['Total'] = crime_type_total
# print all crime stats (arrests, domestic, and totals)
print(crime_type_counts.sort_values(by='Total', ascending=False))
In [8]:
# plot crimes by primary type counts
all_crime_types = (
    crime_type_counts
    .sort_values(by='Total', ascending=True)
    .drop(['Arrest', 'Domestic'], axis=1)  # axis=1 denotes columns
)
ax = all_crime_types.plot(kind='barh', figsize=(10, 8), color='#cc0000')
ax.set_title('All Chicago Crimes by Type (2001-2017)')
ax.set_xlabel('Number of Crime reports')
ax.set_ylabel('Crime Type')
plt.tight_layout()
# show thousands separators on the counts axis
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [9]:
# plot top crimes (>= 200,000 reports)
top_reported = crime_type_counts[crime_type_counts['Total'] >= 200000]
ax = top_reported.sort_values(by='Total', ascending=True).plot(kind='barh', figsize=(10, 8))
ax.set_title('Top Chicago Crimes (>= 200K reports) - (2001-2017)')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Crime Type')
# show thousands separators on the counts axis
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [10]:
# select narcotics crimes
narcotics = crimes[crimes['PrimaryType'] == 'NARCOTICS']
# count records per description, most frequent first
# (after count() the 'PrimaryType' column holds the per-description record count)
narcotics_by_description = narcotics[['PrimaryType', 'Description']].groupby('Description')
narcotics_crimes = (
    narcotics_by_description
    .count()
    .compute()
    .sort_values(by='PrimaryType', ascending=False)
)
print(narcotics_crimes.head(10))
print('...')
print('Total Narcotics Crime Descriptions: {}'.format(len(narcotics_crimes)))
In [11]:
# plot top 20 narcotics crimes (re-sorted ascending before plotting)
top_narcotics = narcotics_crimes.head(20).sort_values(by='PrimaryType', ascending=True)
ax = top_narcotics.plot(kind='barh', figsize=(6, 6), color='#cc0000')
ax.set_title('Top Chicago Narcotics Crimes (2001-2017)')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Narcotics Crime')
# show thousands separators on the counts axis
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [12]:
# select other offense crimes
other_offenses = crimes[crimes['PrimaryType'] == 'OTHER OFFENSE']
# count records per description, most frequent first
# (after count() the 'PrimaryType' column holds the per-description record count)
other_by_description = other_offenses[['PrimaryType', 'Description']].groupby('Description')
other_offense_crimes = (
    other_by_description
    .count()
    .compute()
    .sort_values(by='PrimaryType', ascending=False)
)
print(other_offense_crimes.head(10))
print('...')
print('Total Other Offense Crime Descriptions: {}'.format(len(other_offense_crimes)))
In [13]:
# plot top 20 other offense crimes (re-sorted ascending before plotting)
top_other_offenses = other_offense_crimes.head(20).sort_values(by='PrimaryType', ascending=True)
ax = top_other_offenses.plot(kind='barh', figsize=(6, 6), color='#cc0000')
ax.set_title('Top Chicago Other Offense Crimes (2001-2017)')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Other Offense Crime')
# show thousands separators on the counts axis
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [14]:
# plot crimes per day of the week
days = ['Monday','Tuesday','Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
# record count per day-of-week value of the Date index
crimes_by_weekday = crimes.groupby([crimes.index.dayofweek]).size().compute()
ax = crimes_by_weekday.plot(kind='barh', figsize=(6, 3), color='#cc0000')
ax.set_title('All Chicago Crimes by Day of the Week (2001-2017)')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Week Day')
# label the 7 bar positions with the week day names
plt.yticks(np.arange(7), days)
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [15]:
# plot crimes per month of the year
# record count per month value of the Date index
crimes_by_month = crimes.groupby([crimes.index.month]).size().compute()
ax = crimes_by_month.plot(kind='barh', figsize=(8, 5), color='#cc0000')
ax.set_title('All Chicago Crimes by Month of the Year (2001-2017)')
ax.set_xlabel('Number of crimes')
ax.set_ylabel('Month of the year')
# show thousands separators on the counts axis
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [16]:
%%time
print('\nParsing All Crimes...')
# get monthly total crime counts
monthly_crime = crime_types.resample('M').count().compute()
# print monthly total crime stats
print('---------------------------------------')
print(monthly_crime.head())
print('...\nTotal Crime records: {:,}'.format(len(crimes)))
print('Total Months: {:,}'.format(monthly_crime.size))
# dataframe stats print util function
def print_dataframe_stats(df, title):
print('\n{}\n---------------------------------------'.format(title))
print(df.head())
print('...\nTotal {}: {:,}'.format(title, df.size.compute()))
# get arrests
arrests = crimes[crimes['Arrest'] == True]['Arrest']
print_dataframe_stats(arrests, 'Arrests')
# get domestic crimes
domestic = crimes[crimes['Domestic'] == True]['Domestic']
print_dataframe_stats(domestic, 'Domestic')
In [17]:
# monthly arrests: summing the True flags per month gives the arrest count
monthly_arrests = arrests.resample('M').sum().compute()
ax = monthly_arrests.plot(figsize=(10, 6), color='#3399ff')
ax.set_title('Monthly Chicago Arrests (2001-2017)')
ax.set_xlabel('Months')
ax.set_ylabel('Arrests')
# show thousands separators on the counts axis
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [18]:
# get monthly domestic crimes (summing the True flags per month gives the count)
monthly_domestic = domestic.resample('M').sum().compute()
# plot monthly crime counts, arrests, and domestic incidents reports
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(monthly_crime.index, monthly_crime, '--', label='Total', color='#cc0000')
# label the arrests line so it is listed in the legend with the other two series
ax.plot(monthly_arrests.index, monthly_arrests, label='Arrests', color='#3399ff', zorder=10)
ax.fill_between(monthly_domestic.index, monthly_domestic,
                label='Domestic', color='c')
ax.set_xlabel('Months')
ax.set_ylabel('Number of Crimes')
# show thousands separators on the counts axis
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
ax.legend(loc='upper right')
ax.set_title('Monthly Chicago Crime reports, Arrests, and Domestic incidents (2001-2017)')
plt.show()
In [19]:
%%time
# get crimes rolling sum for a smoother crime decline chart display
crimes_rolling_sum = crime_types.resample('D').count().rolling(365).sum().compute()
crimes_rolling_sum.plot(figsize=(10,6), color='#cc0000')
plt.title('Rolling sum of all Chicago crimes over 365 days period (2001-2017)')
plt.xlabel('Days')
plt.ylabel('Number of Crimes')
ax = plt.gca()
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [20]:
# get top crime types (>= 200,000 reports), largest total first
is_top_crime_type = crime_type_counts['Total'] >= 200000
top_crime_types = crime_type_counts[is_top_crime_type].sort_values(by='Total', ascending=False)
top_crime_types.head(10)
Out[20]:
In [21]:
%%time
print('Getting Top Crimes by Month...')
# get top crimes stats
# TODO: use isin and index by primary type to speed up this query?
top_crimes = crime_types[(crime_types['PrimaryType']=='THEFT') | \
(crime_types['PrimaryType']=='BATTERY') | \
(crime_types['PrimaryType']=='CRIMINAL DAMAGE') | \
(crime_types['PrimaryType']=='NARCOTICS') | \
(crime_types['PrimaryType']=='OTHER OFFENSE') | \
(crime_types['PrimaryType']=='ASSAULT') | \
(crime_types['PrimaryType']=='BURGLARY') | \
(crime_types['PrimaryType']=='MOTOR VEHICLE THEFT') | \
(crime_types['PrimaryType']=='ROBBERY') | \
(crime_types['PrimaryType']=='DECEPTIVE PRACTICE')]
#print(top_crimes.head(10))
# group top crimes by month and crime type
crimes_groupby = top_crimes.groupby([pd.TimeGrouper('M'), 'PrimaryType'])
top_crime_data = crimes_groupby['PrimaryType'].count().compute().rename('Count')
print(top_crime_data.head(10))
print('...')
# unstack top crimes type group for series plotting
top_crime_data = top_crime_data.unstack()
print(top_crime_data.head(2))
print('...')
In [22]:
# plot top crimes data by month (one line per crime type)
ax = top_crime_data.plot(figsize=(10, 8), cmap='tab10')
ax.legend(loc='upper right', frameon=True)
ax.set_title('Top Chicago Crimes by Month')
plt.tight_layout()
ax.set_xlabel('Months')
ax.set_ylabel('Number of Crimes')
# show thousands separators on the counts axis
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [23]:
# plot top crimes grid: one subplot per crime type, 3 per row, independent axes
grid_options = dict(figsize=(12, 12), subplots=True, layout=(-1, 3),
                    cmap='tab10', sharex=False, sharey=False)
top_crime_data.plot(**grid_options)
plt.show()
In [24]:
%%time
print('Getting Property Crimes by Month...')
# get top crimes stats
property_crimes = crime_types[(crime_types['PrimaryType']=='ARSON') | \
(crime_types['PrimaryType']=='BURGLARY') | \
(crime_types['PrimaryType']=='MOTOR VEHICLE THEFT') | \
(crime_types['PrimaryType']=='THEFT')]
#print(property_crimes.head(10))
# group property crimes by month and crime type
crimes_groupby = property_crimes.groupby([pd.TimeGrouper('M'), 'PrimaryType'])
property_crime_data = crimes_groupby['PrimaryType'].count().compute().rename('Count')
print(property_crime_data.head(10))
print('...')
# unstack top crimes type group for series plotting
property_crime_data = property_crime_data.unstack()
print(property_crime_data.head(2))
print('...')
In [25]:
# plot property crimes data by month (one line per crime type)
ax = property_crime_data.plot(figsize=(10, 8), cmap='tab10')
ax.legend(loc='upper right', frameon=True)
ax.set_title('Chicago Property Crimes by Month')
plt.tight_layout()
ax.set_xlabel('Months')
ax.set_ylabel('Number of Crimes')
# show thousands separators on the counts axis
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [26]:
# plot property crimes grid: one subplot per crime type, 3 per row, independent axes
grid_options = dict(figsize=(12, 6), subplots=True, layout=(-1, 3),
                    cmap='tab10', sharex=False, sharey=False)
property_crime_data.plot(**grid_options)
plt.show()
In [27]:
%%time
print('Getting Quality of Life Crimes by Month...')
# get top crimes stats
q_of_life_crimes = crime_types[(crime_types['PrimaryType']=='CRIMINAL DAMAGE') | \
(crime_types['PrimaryType']=='NARCOTICS') | \
(crime_types['PrimaryType']=='PROSTITUTION')]
#print(q_of_life_crimes.head(10))
# group quality of life crimes by month and crime type
crimes_groupby = q_of_life_crimes.groupby([pd.TimeGrouper('M'), 'PrimaryType'])
q_of_life_crime_data = crimes_groupby['PrimaryType'].count().compute().rename('Count')
print(q_of_life_crime_data.head(10))
print('...')
# unstack top crimes type group for series plotting
q_of_life_crime_data = q_of_life_crime_data.unstack()
print(q_of_life_crime_data.head(2))
print('...')
In [28]:
# plot quality of life crimes data by month (one line per crime type)
ax = q_of_life_crime_data.plot(figsize=(10, 8), cmap='tab10')
ax.legend(loc='upper right', frameon=True)
ax.set_title('Chicago Quality of Life Crimes by Month')
plt.tight_layout()
ax.set_xlabel('Months')
ax.set_ylabel('Number of Crimes')
# show thousands separators on the counts axis
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [29]:
# plot quality of life crimes grid: one subplot per crime type, 3 per row, independent axes
grid_options = dict(figsize=(12, 3), subplots=True, layout=(-1, 3),
                    cmap='tab10', sharex=False, sharey=False)
q_of_life_crime_data.plot(**grid_options)
plt.show()
In [30]:
%%time
# get all crime data by month and primary type
crimes_groupby = crime_types.groupby([pd.TimeGrouper('M'), 'PrimaryType'])
all_crime_data = crimes_groupby['PrimaryType'].count().compute().rename('Count')
print(all_crime_data.head(10))
print('...')
# unstack all crimes data for series plotting
all_crime_data = all_crime_data.unstack()
print(all_crime_data.head(2))
In [31]:
%%time
# plot all crimes over time individually
all_crime_data.plot(figsize=(12, 36), subplots=True, layout=(-1, 3),\
sharex=False, sharey=False)
plt.show()
In [32]:
# count crimes per location description, most frequent first, as a series named 'Total'
location_counts = crimes.groupby('LocationDescription').size().compute()
crime_locations = location_counts.sort_values(ascending=False).rename('Total')
# print crime location stats
print(crime_locations.head())
print("...\nTotal Locations: {:,}".format(crime_locations.size))
In [33]:
# plot top 30 crime locations (re-sorted ascending before plotting)
top_locations = crime_locations.head(30).sort_values(ascending=True)
ax = top_locations.plot(kind='barh', figsize=(10, 8), color='#cc0000')
ax.set_title('Top 30 Chicago Crime Locations (2001-2017)')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Location')
# show thousands separators on the counts axis
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [34]:
# load Chicago community areas with pandas
# for plotting crime by Chicago 'sides'
community_areas = pd.read_csv('../data/chicago-community-areas.csv') #, index_col='CommunityName')
# get community crime stats: record count per community area number
community_totals = crimes.groupby('CommunityArea').size().compute()
# look each row's total up by its 'CommunityArea' number; plain column assignment
# would align on the csv row position instead, which only matches the area number
# when the csv rows happen to be ordered and numbered like the index
community_areas['Total'] = community_areas['CommunityArea'].map(community_totals)
# rank communities by total and drop rows with missing values
community_crime = community_areas.sort_values(by='Total', ascending=False).dropna()
# print community crime stats
print(community_crime.head())
print("...\nTotal Communities: {:,}".format(community_crime.Total.count()))
In [35]:
# drop unused columns and index the stats by community name
community_crime = (
    community_crime
    .drop(['CommunityArea', 'Side'], axis=1)  # axis=1 denotes columns
    .set_index('CommunityName')
)
community_crime.head()
Out[35]:
In [36]:
# plot 20 high crime communities (first 20 rows, re-sorted ascending before plotting)
high_crime_communities = community_crime.head(20).sort_values(by='Total', ascending=True)
ax = high_crime_communities.plot(kind='barh', figsize=(10, 8), color='#cc0000')
ax.set_title('High Crime Chicago Communities (2001-2017)')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Community')
# show thousands separators on the counts axis
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [37]:
# get Austin community (area 25) crime stats to see why it is so high
Austin_crime = (
    crimes[crimes['CommunityArea'] == 25]
    .drop(['CommunityArea', 'Year'], axis=1)  # axis=1 denotes columns
)
# count Austin crimes per block, busiest blocks first
Austin_crime_blocks = (
    Austin_crime[['Block']]
    .groupby('Block')
    .size()
    .compute()
    .sort_values(ascending=False)
)
print('Top Austin community crime blocks:')
print(Austin_crime_blocks.head(20))
print('...\nTotal blocks: {}'.format(len(Austin_crime_blocks)))
In [38]:
# get Austin crimes by type, most frequent first
Austin_crime_by_type = Austin_crime[['PrimaryType']]\
    .groupby('PrimaryType').size().compute().rename('Total')\
    .sort_values(ascending=False)
print('Austin community crimes by type:')
print(Austin_crime_by_type)
print('\nTop Austin community crimes:')
print(Austin_crime_by_type.index[:24])
# group top Austin crimes (24 most frequent types) by month and crime type
# (pd.Grouper(freq='M') replaces pd.TimeGrouper('M'), which was deprecated and then removed from pandas)
crimes_groupby = Austin_crime[Austin_crime['PrimaryType'].isin(Austin_crime_by_type.index[:24])]\
    .groupby([pd.Grouper(freq='M'), 'PrimaryType'])
Austin_crime_data = crimes_groupby['PrimaryType'].count().compute().rename('Count')
print('\nAustin community crime by month:')
print(Austin_crime_data.head(10))
print('...')
# unstack top crimes type group for series plotting (one column per crime type)
Austin_crime_data = Austin_crime_data.unstack()
Austin_crime_data.tail()
Out[38]:
In [39]:
# plot top Austin crimes over time individually: one subplot per crime type, 3 per row
grid_options = dict(figsize=(12, 24), subplots=True, layout=(-1, 3),
                    sharex=False, sharey=False)
Austin_crime_data.plot(**grid_options)
plt.show()
In [40]:
# plot Austin crimes on the rise
rising_crime_types = ['HOMICIDE', 'WEAPONS VIOLATION', 'DECEPTIVE PRACTICE']
ax = Austin_crime_data[rising_crime_types].plot(figsize=(10, 8), cmap='tab10')
ax.legend(loc='upper right', frameon=True)
ax.set_title('Austin Chicago community Crimes on the Rise')
plt.tight_layout()
ax.set_xlabel('Months')
ax.set_ylabel('Number of Crimes')
# show thousands separators on the counts axis
ax.yaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [41]:
%%time
# get Austin crime geo data
Austin_crime_geo = Austin_crime[['PrimaryType', 'Latitude', 'Longitude']].dropna()
s1 = Austin_crime_geo[(Austin_crime_geo['PrimaryType']=='HOMICIDE')].compute()
s2 = Austin_crime_geo[(Austin_crime_geo['PrimaryType']=='WEAPONS VIOLATION')].compute()
s3 = Austin_crime_geo[(Austin_crime_geo['PrimaryType']=='KIDNAPPING')].compute()
s4 = Austin_crime_geo[(Austin_crime_geo['PrimaryType']=='THEFT')].compute()
# create Austin crime scatter map plot
fig,ax= plt.subplots(1, 1, figsize=(10,12))
plt.xlim(41.86, 41.925) # (minLat, maxLat)
plt.ylim(-87.808, -87.738) # (minLong, maxLong)
#ax.set_facecolor('black')
# plot thefts, homicides, kidnappings and sexual assaults
ax.scatter(s4['Latitude'], s4['Longitude'],\
s=50, alpha=0.8, color='#66ff99', label='Theft')
ax.scatter(s2['Latitude'], s2['Longitude'],\
s=20, alpha=0.8, color='#ff9933', label='Weapons Violation')
ax.scatter(s3['Latitude'], s3['Longitude'],\
s=20, alpha=0.8, color='#3333ff', label='Kidnapping')
ax.scatter(s1['Latitude'], s1['Longitude'],\
s=20, alpha=0.8, color='#ff0000', label='Homicide')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.title('Austin Chicago Crime Map (2001-2017)')
plt.legend(loc='upper right', frameon=True)
plt.grid(linestyle = '--', linewidth = 0.05)
plt.show()
In [42]:
# plot 20 low crime communities (last 20 rows, re-sorted descending before plotting)
low_crime_communities = community_crime.tail(20).sort_values(by='Total', ascending=False)
ax = low_crime_communities.plot(kind='barh', figsize=(10, 8), color='#3399ff')
ax.set_title('Low Crime Chicago Communities (2001-2017)')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Community')
# show thousands separators on the counts axis
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [43]:
# group crime totals by Chicago 'sides'
# sum only the 'Total' column: summing the whole frame relied on pandas silently
# dropping the text 'CommunityName' column, which newer pandas versions no longer do
crime_by_side = community_areas.groupby('Side')[['Total']].sum()
crime_by_side.sort_values(by='Total', ascending=False)
Out[43]:
In [44]:
# plot crime by Chicago community sides (sorted ascending before plotting)
sides_ascending = crime_by_side.sort_values(by='Total', ascending=True)
ax = sides_ascending.plot(kind='barh', figsize=(6, 4), color='#cc0000')
ax.set_title('Chicago Crime by Sides (2001-2017)')
ax.set_xlabel('Number of Crimes')
ax.set_ylabel('Chicago Side')
# show thousands separators on the counts axis
ax.xaxis.set_major_formatter(ticker.StrMethodFormatter('{x:,.0f}'))
plt.show()
In [45]:
%%time
# get min/max lat and long
#minLat = crimes['Latitude'].min().compute()
#maxLat = crimes['Latitude'].max().compute()
#minLong = crimes['Longitude'].min().compute()
#maxLong = crimes['Longitude'].max().compute()
# get crime geo data
crime_geo = crimes[['PrimaryType', 'Latitude', 'Longitude']].dropna()
# get homicides
homicides = crime_geo[(crime_geo['PrimaryType']=='HOMICIDE')]\
.compute()
# get kidnappings
kidnappings = crime_geo[(crime_geo['PrimaryType']=='KIDNAPPING')]\
.compute()
# get criminal sexual assaults
sexual_assaults = crime_geo[(crime_geo['PrimaryType']=='CRIM SEXUAL ASSAULT')]\
.compute()
# get thefts
thefts = crime_geo[(crime_geo['PrimaryType']=='THEFT')]\
.compute()
# create scatter map plot
fig,ax= plt.subplots(1, 1, figsize=(10,12))
plt.xlim(41.65, 42.02) # (minLat, maxLat)
plt.ylim(-87.78, -87.53) # (minLong, maxLong)
#ax.set_facecolor('black')
# plot thefts, homicides, kidnappings and sexual assaults
ax.scatter(thefts['Latitude'], thefts['Longitude'],\
s=1, alpha=0.8, color='#66ff99', label='Theft')
ax.scatter(sexual_assaults['Latitude'], sexual_assaults['Longitude'],\
s=3, alpha=0.8, color='#ff9933', label='Criminal Sexual Assault')
ax.scatter(kidnappings['Latitude'], kidnappings['Longitude'],\
s=3, alpha=0.8, color='#3333ff', label='Kidnapping')
ax.scatter(homicides['Latitude'], homicides['Longitude'],\
s=3, alpha=0.8, color='#ff0000', label='Homicide')
plt.xlabel('Latitude')
plt.ylabel('Longitude')
plt.title('Chicago Crime Map (2001-2017)')
plt.legend(loc='upper right', frameon=True)
plt.grid(linestyle = '--', linewidth = 0.05)
plt.show()
In [ ]: