In [1]:
import datashader as ds
import dask.dataframe as dd

import bokeh.models
from bokeh.charts import HeatMap
from bokeh.plotting import (
    figure, show, output_file, output_notebook)
from bokeh.tile_providers import STAMEN_TONER

import holoviews as hv
from holoviews.operation.datashader import datashade
from holoviews.operation.datashader import aggregate
from holoviews.operation import decimate
from holoviews import streams
from holoviews.streams import RangeXY, PlotSize

# geo libs imports
import geopandas as gpd
import geoviews as gv
import cartopy.crs as cartopy_crs

import math
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import numpy as np 
import pandas as pd
import seaborn as sns

import os

# version imports
from IPython import __version__ as ipython_version
from pandas import __version__ as pandas_version
from bokeh import __version__ as bokeh_version

from IPython.core.display import Markdown


D:\tools\dev\python\Anaconda3-4.2.0\lib\site-packages\odo\backends\pandas.py:94: FutureWarning: pandas.tslib is deprecated and will be removed in a future version.
You can access NaTType as type(pandas.NaT)
  @convert.register((pd.Timestamp, pd.Timedelta), (pd.tslib.NaTType, type(None)))
D:\tools\dev\python\Anaconda3-4.2.0\lib\site-packages\seaborn\apionly.py:6: UserWarning: As seaborn no longer sets a default style on import, the seaborn.apionly module is deprecated. It will be removed in a future version.
  warnings.warn(msg, UserWarning)

In [2]:
Markdown(open('README.md').read())


Out[2]:

Chicago Crimes Notebooks

This repository contains a number of notebooks for exploring Chicago crimes since 2001 to present (August 2017).

csv-data-preview.ipynb

CSV data preview notebook contains raw CSV data preview code for 2017 Chicago crime data, such as number of reported crimes, number of arrests and domestic crime reports, unique column value counts, etc. to get a feel for the crime data structure and potential insights that can be harvested from it.

CSV data in this notebook is loaded with dask. More info on Dask framework here:

http://dask.pydata.org/en/latest/

Dask is just like pandas (most commonly used data munging framework in Python), but more suited to working with large distributed data sets.

crime-plots.ipynb

Crimes plots notebook contains matplotlib charts for 2017 Chicago crime data.

all-chicago-crime-charts.ipynb

This notebook loads large Chicago crimes dataset (~1.4Gb of raw data, ~219Mb compressed) with all crimes data recorded since 2001 to present (August 2017).

A variety of matplotlib charts in this notebook show decline of Chicago crime over time, as well as some crime location data for futher insights.

interactive-chicago-crime-charts.ipynb

Per description this notebook will contain interactive Bokeh plots that will be packaged and deployed to heroku most likely for public Chicago crimes data visualizations preview.

This part is currently in dev with an ETA of live data viz in September, 2017.

This is Not Yet Another Chicago Crimes Story Telling Journal

This collection of notebooks on Chicago crime was put together strictly for code and data visualization demo purpose with dask and open source Python charting libraries.

Therefore, these notebooks do not contain the usual commentary on visualized data insights interleaved with code and charts to avoid introducing any bias in this short exploratory data analysis (EDA) study.

Many factors affect metropolitan area crimes, including weather, employment, education and poverty levels, racial mix too, that I plan to explore later, federal government policy changes over time, local government changes, etc.

I simply wanted to put together a set of notebooks that depict public Chicago crime data over time, and slice and dice it across different data dimensions for display.

You can certainly draw some conclusions from them on your own, depending on your area of interest in this massive crime data set.

Introduction to Bokeh

Flip through these tutorial notebooks for a good intro to Bokeh:

http://nbviewer.jupyter.org/github/bokeh/bokeh-notebooks/blob/master/tutorial/00%20-%20intro.ipynb


In [3]:
print('Required Python libraries:')
print('IPython - %s' % ipython_version)
print('Pandas - %s' % pandas_version)
print('Bokeh - %s' % bokeh_version)


Required Python libraries:
IPython - 5.1.0
Pandas - 0.20.3
Bokeh - 0.12.4

In [4]:
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'

# set neat seaborn whitegrid styles for matplotlib charts
plt.style.use('seaborn')
sns.set_style('whitegrid')

# config holoviews for bokeh charts
hv.extension('bokeh')



In [5]:
%%time

# set parquet data folder path
parquet_data_folder = '../data/crimes-2001-to-present.snappy.parq'
print('Loading crime data from: {}'.format(parquet_data_folder))

# load crimes parquet data into dask df
crimes = dd.read_parquet(parquet_data_folder, index='Date')

# load all data into memory
crimes = crimes.persist()
print('Crime data loaded into memory.')


Loading crime data from: ../data/crimes-2001-to-present.snappy.parq
Crime data loaded into memory.
Wall time: 12.5 s

In [6]:
# get Chicago community areas geo data
areas = gpd.read_file('../data/chicago-community-areas.geojson')

# drop unused columns
areas = areas.drop(['area', 'area_num_1', 'comarea', 'comarea_id',
                    'shape_area', 'shape_len', 'perimeter'], axis=1)

# rename area_numbe and convert it to int type 
# for crime dataframe merge on community area # later
areas = areas.rename(columns={'area_numbe': 'CommunityArea'})
areas['CommunityArea'] = areas['CommunityArea'].astype(np.int64)

print('Chicago Community Areas:')
print(areas.head())
print('...\nTotal Community Areas: {:,}\n...'.format(len(areas)))
areas.info()


WARNING:Fiona:GDAL data files not located, GDAL_DATA not set
WARNING:Fiona:PROJ data files not located, PROJ_LIB not set
Chicago Community Areas:
   CommunityArea        community  \
0             35          DOUGLAS   
1             36          OAKLAND   
2             37      FULLER PARK   
3             38  GRAND BOULEVARD   
4             39          KENWOOD   

                                            geometry  
0  (POLYGON ((-87.60914087617894 41.8446925026539...  
1  (POLYGON ((-87.59215283879394 41.8169293462668...  
2  (POLYGON ((-87.62879823733725 41.8018930336891...  
3  (POLYGON ((-87.6067081256125 41.81681377057218...  
4  (POLYGON ((-87.59215283879394 41.8169293462668...  
...
Total Community Areas: 77
...
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 3 columns):
CommunityArea    77 non-null int64
community        77 non-null object
geometry         77 non-null object
dtypes: int64(1), object(2)
memory usage: 1.9+ KB

In [7]:
# load Chicago community areas with sides info
# for plotting crime by Chicago 'sides'
community_areas = pd.read_csv('../data/chicago-community-areas.csv') #, index_col='CommunityName')
community_areas.head()

# get community crime stats
community_areas['Total'] = crimes.groupby('CommunityArea').size().compute().rename('Total')
community_crime = community_areas.dropna()

# print community crime stats
print('High Chicago Crime Communities:')
print(community_crime.sort_values(by='Total', ascending=False).head())
print('...\nTotal Communities: {:,}\n...'.format(len(community_crime)))
community_crime.info()


High Chicago Crime Communities:
    CommunityArea    CommunityName        Side   Total
25             25           Austin   West Side  370680
8               8  Near North Side     Central  192403
43             43      South Shore  South Side  189126
23             23    Humboldt Park   West Side  183219
24             24        West Town   West Side  171884
...
Total Communities: 77
...
<class 'pandas.core.frame.DataFrame'>
Int64Index: 77 entries, 1 to 77
Data columns (total 4 columns):
CommunityArea    77 non-null int64
CommunityName    77 non-null object
Side             77 non-null object
Total            77 non-null int64
dtypes: int64(2), object(2)
memory usage: 3.0+ KB
D:\tools\dev\python\Anaconda3-4.2.0\lib\site-packages\pandas\core\indexes\category.py:138: RuntimeWarning: Values and categories have different dtypes. Did you mean to use
'Categorical.from_codes(codes, categories)'?
  data = Categorical(data, categories=categories, ordered=ordered)

In [8]:
# merge areas geo dataframe with community crime data for mapping
community_crime_geo_df = areas.merge(community_crime, on='CommunityArea')
community_crime_geo_df.head()


Out[8]:
CommunityArea community geometry CommunityName Side Total
0 35 DOUGLAS (POLYGON ((-87.60914087617894 41.8446925026539... Douglas South Side 64059
1 36 OAKLAND (POLYGON ((-87.59215283879394 41.8169293462668... Oakland South Side 12661
2 37 FULLER PARK (POLYGON ((-87.62879823733725 41.8018930336891... Fuller Park South Side 19594
3 38 GRAND BOULEVARD (POLYGON ((-87.6067081256125 41.81681377057218... Grand Boulevard South Side 80586
4 39 KENWOOD (POLYGON ((-87.59215283879394 41.8169293462668... Kenwood South Side 33013

In [9]:
# converts geo coordinates to mercator
def geo_to_mercator(x_lon, y_lat):     
    if abs(x_lon) <= 180 and abs(y_lat) < 90:          
        num = x_lon * 0.017453292519943295         
        x = 6378137.0 * num         
        a = y_lat * 0.017453292519943295          
        x_mercator = x         
        y_mercator = 3189068.5 * math.log((1.0 + math.sin(a)) / (1.0 - math.sin(a)))         
        return x_mercator, y_mercator      
    else:         
        print('Invalid coordinate values for conversion')        
print('Chicago mercator coordinates: ', geo_to_mercator(41.91038,-87.67805))

# convert Chicago geo bounds for mapping
geo_bounds = [41.65, -87.78, 42.02, -87.53] # x,y start, x,y end
start_coord = geo_to_mercator(geo_bounds[0], geo_bounds[1])
end_coord = geo_to_mercator(geo_bounds[2], geo_bounds[3])
print('Chicago area mercator bounds:')
print('start:', start_coord)
print('end:', end_coord)


Chicago mercator coordinates:  (4665442.160552597, -24867271.380684968)
Chicago area mercator bounds:
start: (4636456.791539844, -25153725.69958257)
end: (4677645.003133356, -24472918.736834586)

In [10]:
# create Bokeh geo json data source for mapping
areas_ds = bokeh.models.GeoJSONDataSource(geojson=community_crime_geo_df.to_json())
print('min:', community_crime['Total'].min())
print('max:', community_crime['Total'].max())

# world mercator extent
# mercator_extent = dict(start=-20000000, end=20000000, bounds=None)

# Chicago bounds
x_range = bokeh.models.Range1d(start=start_coord[0], end=end_coord[0]) #**mercator_extent)
y_range = bokeh.models.Range1d(start=start_coord[1], end=end_coord[1]) #**mercator_extent)

# map Chicago community areas with Bokeh
output_notebook()
TOOLS = 'pan,wheel_zoom,reset,hover,save'
fig = figure(title='Chicago Crimes by Community (2001-2017)',
             tools=TOOLS,
             #x_range=x_range,
             #y_range=y_range,
             x_axis_location=None,
             y_axis_location=None,
             responsive=True)
fig.axis.visible = False
fig.grid.grid_line_color = None
#fig.add_tile(STAMEN_TONER)

color_mapper = bokeh.models.LinearColorMapper(
    palette=bokeh.palettes.Spectral5, 
    low=community_crime['Total'].min(), 
    high=community_crime['Total'].max())

fig.patches(xs='xs', ys='ys',
            source=areas_ds,
            fill_color={'field': 'Total', 'transform': color_mapper},
            fill_alpha=0.5,
            line_color='black',
            line_width=0.5)

hover = fig.select_one(bokeh.models.HoverTool)
hover.point_policy = 'follow_mouse'
hover.tooltips = u"""
<div>
  <div class="bokeh_hover_tooltip">@community, @Side</div>
  <div class="bokeh_hover_tooltip">Total Crimes: @Total</div>
</div>
"""
output_file('../maps/chicago-crime-by-community.html')
show(fig)


min: 5605
max: 370680
Loading BokehJS ...
INFO:bokeh.core.state:Session output file '../maps/chicago-crime-by-community.html' already exists, will be overwritten.

In [11]:
# get monthly homicides stats
crime_types = crimes[['PrimaryType']]
homicides = crime_types[(crime_types['PrimaryType']=='HOMICIDE')]
daily_homicides = homicides.resample('D').count().compute()
daily_homicides = daily_homicides.rename(columns={'PrimaryType': 'Homicides'})
daily_homicides['Year'] = daily_homicides.index.year
daily_homicides['Month'] = daily_homicides.index.month
daily_homicides['Date'] = daily_homicides.index
print(daily_homicides.head())
print(daily_homicides.tail(10))


            Homicides  Year  Month       Date
2001-01-01          2  2001      1 2001-01-01
2001-01-02          0  2001      1 2001-01-02
2001-01-03          0  2001      1 2001-01-03
2001-01-04          2  2001      1 2001-01-04
2001-01-05          1  2001      1 2001-01-05
            Homicides  Year  Month       Date
2017-08-16          3  2017      8 2017-08-16
2017-08-17          0  2017      8 2017-08-17
2017-08-18          2  2017      8 2017-08-18
2017-08-19          2  2017      8 2017-08-19
2017-08-20          7  2017      8 2017-08-20
2017-08-21          0  2017      8 2017-08-21
2017-08-22          1  2017      8 2017-08-22
2017-08-23          1  2017      8 2017-08-23
2017-08-24          1  2017      8 2017-08-24
2017-08-25          0  2017      8 2017-08-25

In [12]:
# create holoviews dataset for the heatmap
homicides_dataset = hv.Dataset(daily_homicides, vdims=[('Homicides', 'Homicides')])
homicides_dataset


Out[12]:
:Dataset   [Year,Month,Date]   (Homicides)

In [13]:
%opts HeatMap [width=600 height=480 logz=True fontsize={'xticks': '8pt'}, \
               tools=['hover'] toolbar='above' colorbar=True xrotation=30] (cmap='RdBu_r') 

# create homicides heatmap
homicides_heatmap = hv.HeatMap(homicides_dataset.aggregate(['Year', 'Month'], np.sum), 
                               label='Chicago Homicides Heatmap (2001-2017)')
homicides_heatmap


Out[13]:

In [14]:
%opts Curve [width=300 height=480 yaxis='right'] (line_color='black') {+framewise}

# declare Tap stream with heatmap as source and initial values
posxy = hv.streams.Tap(source=homicides_heatmap, x=2017, y=8)

# histogram tap function
def tap_histogram(x,y):
    return hv.Curve(homicides_dataset.select(Month=y, Year=x), 
                    kdims=['Date'],
                    label='Year %s, Month: %s' % (x, y))

# see http://build.holoviews.org/reference/streams/bokeh/heatmap_tap.html
homicides_heatmap.select(Year=(2001, 2018)) + hv.DynamicMap(tap_histogram, kdims=[], streams=[posxy])


Out[14]:

In [15]:
homicides_boxwhisker = hv.BoxWhisker(
    homicides_dataset.aggregate(['Year', 'Month'], np.sum), 
    kdims=['Month'], vdims=['Homicides'], 
    label='Monthly Chicago Homicides (2001-2017)')
plot_options = dict(show_legend=False, width=400)
style = dict(color='Month')
homicides_boxwhisker(plot=plot_options, style=style)


Out[15]:

In [16]:
%%time
crimes = crimes.dropna()
crimes.tail()


Wall time: 4.78 s

In [17]:
# reset holoviews plot and style options for all cirmes map
hv.util.opts('Image [width=800 height=400 shared_axes=False logz=True] {+axiswise} ')
hv.util.opts("HLine VLine (color='white' line_width=1) Layout [shared_axes=False] ")
hv.util.opts("Curve [xaxis=None yaxis=None show_grid=False, show_frame=False] (color='orangered') {+framewise}")

# Reproject crime points from Mercator to PlateCarree (latitude/longitude)
# see: http://holoviews.org/gallery/apps/bokeh/nytaxi_hover.html#bokeh-gallery-nytaxi-hover
points = gv.Points(crimes, kdims=['Longitude', 'Latitude'], vdims=[], crs=cartopy_crs.GOOGLE_MERCATOR)
projected = gv.operation.project_points(points, projection=cartopy_crs.PlateCarree())
projected = projected.redim(Longitude='lon', Latitude='lat')

# Use datashader to rasterize and linked streams for interactivity
agg = aggregate(projected, link_inputs=True, x_sampling=0.0001, y_sampling=0.0001)
pointerx = hv.streams.PointerX(x=-74, source=projected)
pointery = hv.streams.PointerY(y=40.8,  source=projected)
vline = hv.DynamicMap(lambda x: hv.VLine(x), streams=[pointerx])
hline = hv.DynamicMap(lambda y: hv.HLine(y), streams=[pointery])

sampled = hv.util.Dynamic(agg, operation=lambda obj, x: obj.sample(lon=x),
                          streams=[pointerx], link_inputs=False)
hvobj = ((hline * vline)) # << sampled.opts(plot={'Curve': dict(width=100)}))
hvobj
#hvobj = ((agg * hline * vline) << sampled.opts(plot={'Curve': dict(width=100)}))

# Obtain Bokeh document and set the title
#doc = hv.renderer('bokeh').server_doc(hvobj)
#doc.title = 'Chicago Crimes FireFly Map Crosshair'


Out[17]:

In [ ]: