notebook.community

Edit and run



In [1]:

    
import datashader as ds
import dask.dataframe as dd

import bokeh.models
from bokeh.charts import HeatMap
from bokeh.plotting import (
    figure, show, output_file, output_notebook)
from bokeh.tile_providers import STAMEN_TONER

import holoviews as hv
from holoviews.operation.datashader import datashade
from holoviews.operation.datashader import aggregate
from holoviews.operation import decimate
from holoviews import streams
from holoviews.streams import RangeXY, PlotSize

# geo libs imports
import geopandas as gpd
import geoviews as gv
import cartopy.crs as cartopy_crs

import math
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

import numpy as np 
import pandas as pd
import seaborn as sns

import os

# version imports
from IPython import __version__ as ipython_version
from pandas import __version__ as pandas_version
from bokeh import __version__ as bokeh_version

from IPython.core.display import Markdown









    



D:\tools\dev\python\Anaconda3-4.2.0\lib\site-packages\odo\backends\pandas.py:94: FutureWarning: pandas.tslib is deprecated and will be removed in a future version.
You can access NaTType as type(pandas.NaT)
  @convert.register((pd.Timestamp, pd.Timedelta), (pd.tslib.NaTType, type(None)))
D:\tools\dev\python\Anaconda3-4.2.0\lib\site-packages\seaborn\apionly.py:6: UserWarning: As seaborn no longer sets a default style on import, the seaborn.apionly module is deprecated. It will be removed in a future version.
  warnings.warn(msg, UserWarning)



In [2]:

    
Markdown(open('README.md').read())









    Out[2]:




Chicago Crimes Notebooks
This repository contains a number of notebooks for exploring Chicago crimes since 2001 to present (August 2017).
csv-data-preview.ipynb
CSV data preview notebook contains raw CSV data preview code for 2017 Chicago crime data,
such as number of reported crimes, number of arrests and domestic crime reports,
unique column value counts, etc. to get a feel for the crime data structure 
and potential insights that can be harvested from it.
CSV data in this notebook is loaded with dask. More info on Dask framework here:
http://dask.pydata.org/en/latest/
Dask is just like pandas (most commonly used data munging framework in Python), 
but more suited to working with large distributed data sets.
crime-plots.ipynb
Crimes plots notebook contains matplotlib charts for 2017 Chicago crime data.
all-chicago-crime-charts.ipynb
This notebook loads large Chicago crimes dataset (~1.4Gb of raw data, ~219Mb compressed) 
with all crimes data recorded since 2001 to present (August 2017).
A variety of matplotlib charts in this notebook show decline of Chicago crime over time,
as well as some crime location data for futher insights.
interactive-chicago-crime-charts.ipynb
Per description this notebook will contain interactive Bokeh plots 
that will be packaged and deployed to heroku most likely 
for public Chicago crimes data visualizations preview.
This part is currently in dev with an ETA of live data viz in September, 2017.
This is Not Yet Another Chicago Crimes Story Telling Journal
This collection of notebooks on Chicago crime was put together strictly
for code and data visualization demo purpose with dask 
and open source Python charting libraries.
Therefore, these notebooks do not contain the usual commentary on visualized
data insights interleaved with code and charts to avoid introducing any bias 
in this short exploratory data analysis (EDA) study.
Many factors affect metropolitan area crimes, including weather, 
employment, education and poverty levels, racial mix too, that I plan to explore later,
federal government policy changes over time, local government changes, etc.
I simply wanted to put together a set of notebooks that depict 
public Chicago crime data over time, and slice and dice it across 
different data dimensions for display.
You can certainly draw some conclusions from them on your own, 
depending on your area of interest in this massive crime data set.
Introduction to Bokeh
Flip through these tutorial notebooks for a good intro to Bokeh:
http://nbviewer.jupyter.org/github/bokeh/bokeh-notebooks/blob/master/tutorial/00%20-%20intro.ipynb



In [3]:

    
print('Required Python libraries:')
print('IPython - %s' % ipython_version)
print('Pandas - %s' % pandas_version)
print('Bokeh - %s' % bokeh_version)









    



Required Python libraries:
IPython - 5.1.0
Pandas - 0.20.3
Bokeh - 0.12.4



In [4]:

    
%matplotlib inline
#%config InlineBackend.figure_format = 'svg'

# set neat seaborn whitegrid styles for matplotlib charts
plt.style.use('seaborn')
sns.set_style('whitegrid')

# config holoviews for bokeh charts
hv.extension('bokeh')



In [5]:

    
%%time

# set parquet data folder path
parquet_data_folder = '../data/crimes-2001-to-present.snappy.parq'
print('Loading crime data from: {}'.format(parquet_data_folder))

# load crimes parquet data into dask df
crimes = dd.read_parquet(parquet_data_folder, index='Date')

# load all data into memory
crimes = crimes.persist()
print('Crime data loaded into memory.')









    



Loading crime data from: ../data/crimes-2001-to-present.snappy.parq
Crime data loaded into memory.
Wall time: 12.5 s



In [6]:

    
# get Chicago community areas geo data
areas = gpd.read_file('../data/chicago-community-areas.geojson')

# drop unused columns
areas = areas.drop(['area', 'area_num_1', 'comarea', 'comarea_id',
                    'shape_area', 'shape_len', 'perimeter'], axis=1)

# rename area_numbe and convert it to int type 
# for crime dataframe merge on community area # later
areas = areas.rename(columns={'area_numbe': 'CommunityArea'})
areas['CommunityArea'] = areas['CommunityArea'].astype(np.int64)

print('Chicago Community Areas:')
print(areas.head())
print('...\nTotal Community Areas: {:,}\n...'.format(len(areas)))
areas.info()









    



WARNING:Fiona:GDAL data files not located, GDAL_DATA not set
WARNING:Fiona:PROJ data files not located, PROJ_LIB not set






    



Chicago Community Areas:
   CommunityArea        community  \
0             35          DOUGLAS   
1             36          OAKLAND   
2             37      FULLER PARK   
3             38  GRAND BOULEVARD   
4             39          KENWOOD   

                                            geometry  
0  (POLYGON ((-87.60914087617894 41.8446925026539...  
1  (POLYGON ((-87.59215283879394 41.8169293462668...  
2  (POLYGON ((-87.62879823733725 41.8018930336891...  
3  (POLYGON ((-87.6067081256125 41.81681377057218...  
4  (POLYGON ((-87.59215283879394 41.8169293462668...  
...
Total Community Areas: 77
...
<class 'geopandas.geodataframe.GeoDataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 3 columns):
CommunityArea    77 non-null int64
community        77 non-null object
geometry         77 non-null object
dtypes: int64(1), object(2)
memory usage: 1.9+ KB



In [7]:

    
# load Chicago community areas with sides info
# for plotting crime by Chicago 'sides'
community_areas = pd.read_csv('../data/chicago-community-areas.csv') #, index_col='CommunityName')
community_areas.head()

# get community crime stats
community_areas['Total'] = crimes.groupby('CommunityArea').size().compute().rename('Total')
community_crime = community_areas.dropna()

# print community crime stats
print('High Chicago Crime Communities:')
print(community_crime.sort_values(by='Total', ascending=False).head())
print('...\nTotal Communities: {:,}\n...'.format(len(community_crime)))
community_crime.info()









    



High Chicago Crime Communities:
    CommunityArea    CommunityName        Side   Total
25             25           Austin   West Side  370680
8               8  Near North Side     Central  192403
43             43      South Shore  South Side  189126
23             23    Humboldt Park   West Side  183219
24             24        West Town   West Side  171884
...
Total Communities: 77
...
<class 'pandas.core.frame.DataFrame'>
Int64Index: 77 entries, 1 to 77
Data columns (total 4 columns):
CommunityArea    77 non-null int64
CommunityName    77 non-null object
Side             77 non-null object
Total            77 non-null int64
dtypes: int64(2), object(2)
memory usage: 3.0+ KB






    



D:\tools\dev\python\Anaconda3-4.2.0\lib\site-packages\pandas\core\indexes\category.py:138: RuntimeWarning: Values and categories have different dtypes. Did you mean to use
'Categorical.from_codes(codes, categories)'?
  data = Categorical(data, categories=categories, ordered=ordered)



In [8]:

    
# merge areas geo dataframe with community crime data for mapping
community_crime_geo_df = areas.merge(community_crime, on='CommunityArea')
community_crime_geo_df.head()









    Out[8]:







  
    
      
      CommunityArea
      community
      geometry
      CommunityName
      Side
      Total
    
  
  
    
      0
      35
      DOUGLAS
      (POLYGON ((-87.60914087617894 41.8446925026539...
      Douglas
      South Side
      64059
    
    
      1
      36
      OAKLAND
      (POLYGON ((-87.59215283879394 41.8169293462668...
      Oakland
      South Side
      12661
    
    
      2
      37
      FULLER PARK
      (POLYGON ((-87.62879823733725 41.8018930336891...
      Fuller Park
      South Side
      19594
    
    
      3
      38
      GRAND BOULEVARD
      (POLYGON ((-87.6067081256125 41.81681377057218...
      Grand Boulevard
      South Side
      80586
    
    
      4
      39
      KENWOOD
      (POLYGON ((-87.59215283879394 41.8169293462668...
      Kenwood
      South Side
      33013



In [9]:

    
# converts geo coordinates to mercator
def geo_to_mercator(x_lon, y_lat):     
    if abs(x_lon) <= 180 and abs(y_lat) < 90:          
        num = x_lon * 0.017453292519943295         
        x = 6378137.0 * num         
        a = y_lat * 0.017453292519943295          
        x_mercator = x         
        y_mercator = 3189068.5 * math.log((1.0 + math.sin(a)) / (1.0 - math.sin(a)))         
        return x_mercator, y_mercator      
    else:         
        print('Invalid coordinate values for conversion')        
print('Chicago mercator coordinates: ', geo_to_mercator(41.91038,-87.67805))

# convert Chicago geo bounds for mapping
geo_bounds = [41.65, -87.78, 42.02, -87.53] # x,y start, x,y end
start_coord = geo_to_mercator(geo_bounds[0], geo_bounds[1])
end_coord = geo_to_mercator(geo_bounds[2], geo_bounds[3])
print('Chicago area mercator bounds:')
print('start:', start_coord)
print('end:', end_coord)









    



Chicago mercator coordinates:  (4665442.160552597, -24867271.380684968)
Chicago area mercator bounds:
start: (4636456.791539844, -25153725.69958257)
end: (4677645.003133356, -24472918.736834586)



In [10]:

    
# create Bokeh geo json data source for mapping
areas_ds = bokeh.models.GeoJSONDataSource(geojson=community_crime_geo_df.to_json())
print('min:', community_crime['Total'].min())
print('max:', community_crime['Total'].max())

# world mercator extent
# mercator_extent = dict(start=-20000000, end=20000000, bounds=None)

# Chicago bounds
x_range = bokeh.models.Range1d(start=start_coord[0], end=end_coord[0]) #**mercator_extent)
y_range = bokeh.models.Range1d(start=start_coord[1], end=end_coord[1]) #**mercator_extent)

# map Chicago community areas with Bokeh
output_notebook()
TOOLS = 'pan,wheel_zoom,reset,hover,save'
fig = figure(title='Chicago Crimes by Community (2001-2017)',
             tools=TOOLS,
             #x_range=x_range,
             #y_range=y_range,
             x_axis_location=None,
             y_axis_location=None,
             responsive=True)
fig.axis.visible = False
fig.grid.grid_line_color = None
#fig.add_tile(STAMEN_TONER)

color_mapper = bokeh.models.LinearColorMapper(
    palette=bokeh.palettes.Spectral5, 
    low=community_crime['Total'].min(), 
    high=community_crime['Total'].max())

fig.patches(xs='xs', ys='ys',
            source=areas_ds,
            fill_color={'field': 'Total', 'transform': color_mapper},
            fill_alpha=0.5,
            line_color='black',
            line_width=0.5)

hover = fig.select_one(bokeh.models.HoverTool)
hover.point_policy = 'follow_mouse'
hover.tooltips = u"""
<div>
  <div class="bokeh_hover_tooltip">@community, @Side</div>
  <div class="bokeh_hover_tooltip">Total Crimes: @Total</div>
</div>
"""
output_file('../maps/chicago-crime-by-community.html')
show(fig)









    



min: 5605
max: 370680






    





    
        
        Loading BokehJS ...
    






    














    



INFO:bokeh.core.state:Session output file '../maps/chicago-crime-by-community.html' already exists, will be overwritten.



In [11]:

    
# get monthly homicides stats
crime_types = crimes[['PrimaryType']]
homicides = crime_types[(crime_types['PrimaryType']=='HOMICIDE')]
daily_homicides = homicides.resample('D').count().compute()
daily_homicides = daily_homicides.rename(columns={'PrimaryType': 'Homicides'})
daily_homicides['Year'] = daily_homicides.index.year
daily_homicides['Month'] = daily_homicides.index.month
daily_homicides['Date'] = daily_homicides.index
print(daily_homicides.head())
print(daily_homicides.tail(10))









    



            Homicides  Year  Month       Date
2001-01-01          2  2001      1 2001-01-01
2001-01-02          0  2001      1 2001-01-02
2001-01-03          0  2001      1 2001-01-03
2001-01-04          2  2001      1 2001-01-04
2001-01-05          1  2001      1 2001-01-05
            Homicides  Year  Month       Date
2017-08-16          3  2017      8 2017-08-16
2017-08-17          0  2017      8 2017-08-17
2017-08-18          2  2017      8 2017-08-18
2017-08-19          2  2017      8 2017-08-19
2017-08-20          7  2017      8 2017-08-20
2017-08-21          0  2017      8 2017-08-21
2017-08-22          1  2017      8 2017-08-22
2017-08-23          1  2017      8 2017-08-23
2017-08-24          1  2017      8 2017-08-24
2017-08-25          0  2017      8 2017-08-25



In [12]:

    
# create holoviews dataset for the heatmap
homicides_dataset = hv.Dataset(daily_homicides, vdims=[('Homicides', 'Homicides')])
homicides_dataset









    Out[12]:





:Dataset   [Year,Month,Date]   (Homicides)



In [13]:

    
%opts HeatMap [width=600 height=480 logz=True fontsize={'xticks': '8pt'}, \
               tools=['hover'] toolbar='above' colorbar=True xrotation=30] (cmap='RdBu_r') 

# create homicides heatmap
homicides_heatmap = hv.HeatMap(homicides_dataset.aggregate(['Year', 'Month'], np.sum), 
                               label='Chicago Homicides Heatmap (2001-2017)')
homicides_heatmap









    Out[13]:



In [14]:

    
%opts Curve [width=300 height=480 yaxis='right'] (line_color='black') {+framewise}

# declare Tap stream with heatmap as source and initial values
posxy = hv.streams.Tap(source=homicides_heatmap, x=2017, y=8)

# histogram tap function
def tap_histogram(x,y):
    return hv.Curve(homicides_dataset.select(Month=y, Year=x), 
                    kdims=['Date'],
                    label='Year %s, Month: %s' % (x, y))

# see http://build.holoviews.org/reference/streams/bokeh/heatmap_tap.html
homicides_heatmap.select(Year=(2001, 2018)) + hv.DynamicMap(tap_histogram, kdims=[], streams=[posxy])









    Out[14]:



In [15]:

    
homicides_boxwhisker = hv.BoxWhisker(
    homicides_dataset.aggregate(['Year', 'Month'], np.sum), 
    kdims=['Month'], vdims=['Homicides'], 
    label='Monthly Chicago Homicides (2001-2017)')
plot_options = dict(show_legend=False, width=400)
style = dict(color='Month')
homicides_boxwhisker(plot=plot_options, style=style)









    Out[15]:



In [16]:

    
%%time
crimes = crimes.dropna()
crimes.tail()









    



Wall time: 4.78 s



In [17]:

    
# reset holoviews plot and style options for all cirmes map
hv.util.opts('Image [width=800 height=400 shared_axes=False logz=True] {+axiswise} ')
hv.util.opts("HLine VLine (color='white' line_width=1) Layout [shared_axes=False] ")
hv.util.opts("Curve [xaxis=None yaxis=None show_grid=False, show_frame=False] (color='orangered') {+framewise}")

# Reproject crime points from Mercator to PlateCarree (latitude/longitude)
# see: http://holoviews.org/gallery/apps/bokeh/nytaxi_hover.html#bokeh-gallery-nytaxi-hover
points = gv.Points(crimes, kdims=['Longitude', 'Latitude'], vdims=[], crs=cartopy_crs.GOOGLE_MERCATOR)
projected = gv.operation.project_points(points, projection=cartopy_crs.PlateCarree())
projected = projected.redim(Longitude='lon', Latitude='lat')

# Use datashader to rasterize and linked streams for interactivity
agg = aggregate(projected, link_inputs=True, x_sampling=0.0001, y_sampling=0.0001)
pointerx = hv.streams.PointerX(x=-74, source=projected)
pointery = hv.streams.PointerY(y=40.8,  source=projected)
vline = hv.DynamicMap(lambda x: hv.VLine(x), streams=[pointerx])
hline = hv.DynamicMap(lambda y: hv.HLine(y), streams=[pointery])

sampled = hv.util.Dynamic(agg, operation=lambda obj, x: obj.sample(lon=x),
                          streams=[pointerx], link_inputs=False)
hvobj = ((hline * vline)) # << sampled.opts(plot={'Curve': dict(width=100)}))
hvobj
#hvobj = ((agg * hline * vline) << sampled.opts(plot={'Curve': dict(width=100)}))

# Obtain Bokeh document and set the title
#doc = hv.renderer('bokeh').server_doc(hvobj)
#doc.title = 'Chicago Crimes FireFly Map Crosshair'









    Out[17]:



In [ ]:

	CommunityArea	community	geometry	CommunityName	Side	Total
0	35	DOUGLAS	(POLYGON ((-87.60914087617894 41.8446925026539...	Douglas	South Side	64059
1	36	OAKLAND	(POLYGON ((-87.59215283879394 41.8169293462668...	Oakland	South Side	12661
2	37	FULLER PARK	(POLYGON ((-87.62879823733725 41.8018930336891...	Fuller Park	South Side	19594
3	38	GRAND BOULEVARD	(POLYGON ((-87.6067081256125 41.81681377057218...	Grand Boulevard	South Side	80586
4	39	KENWOOD	(POLYGON ((-87.59215283879394 41.8169293462668...	Kenwood	South Side	33013