In this assignment, you will analyze criminal incident data from Seattle or San Francisco to visualize patterns and, if desired, contrast and compare patterns across the two cities.
Two sets of data files are available.
Assignment: Crime Analytics: Visualization of Incident Reports. Entry by Jerry Thomas.
Mostly based on the excellent ProntoData Analysis by Jake Vanderplas.
In [40]:
%matplotlib inline
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
In [4]:
# Read the San Francisco incident reports into a DataFrame.
sanfran = pd.read_csv(filepath_or_buffer='sanfrancisco_incidents_summer_2014.csv')
# Show the column names, one per row, as an overview of the available fields.
pd.DataFrame(data=sanfran.columns)
Out[4]:
In [5]:
# Read the Seattle incident reports, parsing the incident start timestamp while loading.
# infer_datetime_format is no longer passed: pandas 2.0 deprecated the argument
# (format inference is the parser's default there), so it only produced a warning.
seattle = pd.read_csv(
    'seattle_incidents_summer_2014.csv',
    parse_dates=['Occurred Date or Date Range Start'],
)
# Show the column names, one per row, as an overview of the available fields.
pd.DataFrame(seattle.columns)
Out[5]:
In [6]:
# Distinct values of the San Francisco 'Category' column, in order of first appearance.
# Series.unique() replaces pd.unique(Series.ravel()): Series.ravel is deprecated
# since pandas 2.2 and the column is already one-dimensional.
sfo_incident_category = pd.DataFrame(sanfran['Category'].unique())
sfo_incident_category
Out[6]:
In [7]:
# Distinct values of the Seattle 'Summarized Offense Description' column,
# in order of first appearance.
# Series.unique() replaces pd.unique(Series.ravel()): Series.ravel is deprecated
# since pandas 2.2 and the column is already one-dimensional.
seattle_incident_category = pd.DataFrame(seattle['Summarized Offense Description'].unique())
seattle_incident_category
Out[7]:
In [8]:
# Preview the first five San Francisco rows (five is the default for head()).
sanfran.head()
Out[8]:
In [9]:
# Preview the first five Seattle rows (five is the default for head()).
seattle.head()
Out[9]:
In [10]:
# Number of missing (null) values in every San Francisco column, keyed by column name.
# isnull().sum() counts the nulls of all columns in one vectorised pass instead of
# building a filtered copy of the whole frame per column just to take its length.
sanfran_missing_count = {
    col_name: int(null_count)
    for col_name, null_count in sanfran.isnull().sum().items()
}
sanfran_missing_count
Out[10]:
In [11]:
# Number of missing (null) values in every Seattle column, keyed by column name.
# isnull().sum() counts the nulls of all columns in one vectorised pass instead of
# building a filtered copy of the whole frame per column just to take its length.
seattle_missing_count = {
    col_name: int(null_count)
    for col_name, null_count in seattle.isnull().sum().items()
}
seattle_missing_count
Out[11]:
In [12]:
# The date and time of each incident are stored in two separate columns;
# combine them into a single date_time timestamp column.
sanfran['date_time'] = pd.to_datetime(sanfran['Date'] + ' ' + sanfran['Time'])
date_idx = pd.DatetimeIndex(sanfran['date_time'])

# Calendar components derived from the timestamp.
# normalize() truncates each timestamp to midnight and keeps it a datetime,
# replacing the round trip through an object array of datetime.date values
# followed by a unit-less 'datetime64' cast.
sanfran['incident_date'] = date_idx.normalize()
sanfran['incident_hour'] = date_idx.hour
sanfran['incident_year'] = date_idx.year
sanfran['incident_month'] = date_idx.month
sanfran['incident_weekday'] = date_idx.weekday  # pandas convention: 0 = Monday ... 6 = Sunday

# Incident counts per year (rows) and month (columns).
by_year = sanfran.pivot_table(values='IncidntNum', aggfunc='count',
                              index='incident_year',
                              columns='incident_month')
by_year
Out[12]:
In [13]:
# The Seattle data is given a date_time column holding the incident start timestamp,
# mirroring the column name used for San Francisco.
seattle['date_time'] = seattle['Occurred Date or Date Range Start']
date_idx = pd.DatetimeIndex(seattle['date_time'])

# Calendar components derived from the timestamp.
# normalize() truncates each timestamp to midnight and keeps it a datetime,
# replacing the round trip through an object array of datetime.date values
# followed by a unit-less 'datetime64' cast.
seattle['incident_date'] = date_idx.normalize()
seattle['incident_hour'] = date_idx.hour
seattle['incident_year'] = date_idx.year
seattle['incident_month'] = date_idx.month
seattle['incident_weekday'] = date_idx.weekday  # pandas convention: 0 = Monday ... 6 = Sunday

# Incident counts per year (rows) and month (columns).
by_year = seattle.pivot_table(values='General Offense Number', aggfunc='count',
                              index='incident_year',
                              columns='incident_month')
by_year
Out[13]:
There is only one year of data, and it spans the three months June–August. This looks consistent, although there is no way to confirm it.
http://www.legalmatch.com/law-library/article/what-are-the-different-types-of-crimes.html
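I took the four prominent crime categories (personal, property, inchoate and statutory) and remapped each incident type to one of them to the best of my knowledge; incident types that fit none of them stay under OTHER OFFENSES or NON-CRIMINAL.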
See more at: http://www.legalmatch.com/law-library/article/what-are-the-different-types-of-crimes.html#sthash.xHKGRbs4.dpuf
In [22]:
# Detailed offense descriptions grouped by the coarse incident category
# each one is assigned to.
_offenses_by_category = {
    'PERSONAL': [
        'ASSAULT',
        'HOMICIDE',
        'INJURY',
        'KIDNAPPING',
        'MISSING PERSON',
        'SUICIDE',
    ],
    'PROPERTY': [
        'ARSON',
        'BIKE THEFT',
        'BRIBERY',
        'BURGLARY',
        'BURGLARY-SECURE PARKING-RES',
        'COUNTERFEIT',
        'EMBEZZLE',
        'EMBEZZLEMENT',
        'FIREWORK',
        'FORGERY',
        'FORGERY/COUNTERFEITING',
        'FRAUD',
        'GAMBLING',
        'LARCENY/THEFT',
        'LOST PROPERTY',
        'MAIL THEFT',
        'OBSTRUCT',
        'OTHER PROPERTY',
        'PICKPOCKET',
        'PROPERTY DAMAGE',
        'PURSE SNATCH',
        'RECKLESS BURNING',
        'RECOVERED PROPERTY',
        'ROBBERY',
        'SHOPLIFTING',
        'STOLEN PROPERTY',
        'SUSPICIOUS OCC',
        'THEFT OF SERVICES',
        'VANDALISM',
        'VEHICLE THEFT',
    ],
    'INCHOATE': [
        'BIAS INCIDENT',
        'CAR PROWL',
        'DISORDERLY CONDUCT',
        'DISPUTE',
        'DISTURBANCE',
        'ELUDING',
        'EXTORTION',
        'FALSE REPORT',
        'FAMILY OFFENSES',
        'LOITERING',
        'PORNOGRAPHY',
        'PORNOGRAPHY/OBSCENE MAT',
        'PROSTITUTION',
        'PUBLIC NUISANCE',
        'RUNAWAY',
        'THREATS',
        'TRESPASS',
    ],
    'STATUTORY': [
        'DRIVING UNDER THE INFLUENCE',
        'DRUG/NARCOTIC',
        'DRUNKENNESS',
        'DUI',
        'ESCAPE',
        'ILLEGAL DUMPING',
        'LIQUOR LAWS',
        'LIQUOR VIOLATION',
        'NARCOTICS',
        'SECONDARY CODES',
        'TRAFFIC',
        'VIOLATION OF COURT ORDER',
        'WARRANT ARREST',
        'WARRANTS',
        'WEAPON',
        'WEAPON LAWS',
    ],
    'OTHER OFFENSES': [
        '[INC - CASE DC USE ONLY]',
        'ANIMAL COMPLAINT',
        'OTHER OFFENSES',
    ],
    'NON-CRIMINAL': [
        'NON-CRIMINAL',
    ],
}

# Flat lookup table: offense description -> coarse incident category.
map_categories = {
    offense: category
    for category, offenses in _offenses_by_category.items()
    for offense in offenses
}
def _to_incident_category(offenses):
    """Translate a Series of offense descriptions into coarse incident categories.

    The lookup goes through ``map_categories`` with ``Series.map``, which is
    vectorised but turns any description that is not a key of the dict
    (including a missing value) into NaN. Those are reported explicitly so the
    cell still fails fast, as a direct dictionary lookup would, while naming
    every unmapped description at once.

    Parameters
    ----------
    offenses : pandas.Series
        Offense descriptions to translate.

    Returns
    -------
    pandas.Series
        The category of each description, aligned with ``offenses``.

    Raises
    ------
    KeyError
        If at least one description has no entry in ``map_categories``.
    """
    categories = offenses.map(map_categories)
    unmapped = offenses[categories.isnull()].unique()
    if len(unmapped) > 0:
        raise KeyError(
            'No incident_category mapping for: {}'.format(sorted(map(str, unmapped)))
        )
    return categories


# Map the incident codes of both cities to the smaller shared set of categories.
seattle['incident_category'] = _to_incident_category(seattle['Summarized Offense Description'])
sanfran['incident_category'] = _to_incident_category(sanfran['Category'])
In [78]:
# Daily number of Seattle incidents, one line per incident_category.
# fill_value=0 stores a date without incidents of some category as 0; without it the
# cell is NaN and that point is missing from the category's line.
by_date = seattle.pivot_table(values='General Offense Number', aggfunc='count',
                              index='incident_date',
                              columns='incident_category',
                              fill_value=0)
ax = by_date.plot()
# Title and axis labels let the saved figure be read on its own.
ax.set(title='Seattle incidents by date',
       xlabel='Incident date',
       ylabel='Number of incidents')
os.makedirs('figs', exist_ok=True)  # savefig does not create a missing output directory
ax.figure.savefig('figs/seattle_incidents_by_date.png', bbox_inches='tight')
In [77]:
# Number of Seattle incidents per day of the week, one line per incident_category.
# fill_value=0 stores a weekday without incidents of some category as 0; without it
# the cell is NaN and that point is missing from the category's line.
by_weekday = seattle.pivot_table(values='General Offense Number', aggfunc='count',
                                 index='incident_weekday',
                                 columns='incident_category',
                                 fill_value=0)
ax = by_weekday.plot()
# Title and axis labels let the saved figure be read on its own.
ax.set(title='Seattle incidents by day of week',
       xlabel='Day of week (0 = Monday, 6 = Sunday)',
       ylabel='Number of incidents')
os.makedirs('figs', exist_ok=True)  # savefig does not create a missing output directory
# NOTE(review): unlike the other figures this file name carries no city prefix;
# kept unchanged in case something else references the file.
ax.figure.savefig('figs/incidents_by_weekday.png', bbox_inches='tight')
In [75]:
# Number of Seattle incidents per hour of the day, one line per incident_category.
# fill_value=0 stores an hour without incidents of some category as 0; without it
# the cell is NaN and that point is missing from the category's line.
by_hour = seattle.pivot_table(values='General Offense Number', aggfunc='count',
                              index='incident_hour',
                              columns='incident_category',
                              fill_value=0)
ax = by_hour.plot()
# Title and axis labels let the saved figure be read on its own.
ax.set(title='Seattle incidents by hour of day',
       xlabel='Hour of day',
       ylabel='Number of incidents')
os.makedirs('figs', exist_ok=True)  # savefig does not create a missing output directory
# NOTE(review): unlike the other figures this file name carries no city prefix;
# kept unchanged in case something else references the file.
ax.figure.savefig('figs/incidents_by_hour.png', bbox_inches='tight')
In [79]:
# Daily number of San Francisco incidents, one line per incident_category.
# fill_value=0 stores a date without incidents of some category as 0; without it the
# cell is NaN and that point is missing from the category's line.
by_date = sanfran.pivot_table(values='IncidntNum', aggfunc='count',
                              index='incident_date',
                              columns='incident_category',
                              fill_value=0)
ax = by_date.plot()
# Title and axis labels let the saved figure be read on its own.
ax.set(title='San Francisco incidents by date',
       xlabel='Incident date',
       ylabel='Number of incidents')
os.makedirs('figs', exist_ok=True)  # savefig does not create a missing output directory
ax.figure.savefig('figs/sanfran_incidents_by_date.png', bbox_inches='tight')
In [81]:
# Number of San Francisco incidents per day of the week, one line per incident_category.
# fill_value=0 stores a weekday without incidents of some category as 0; without it
# the cell is NaN and that point is missing from the category's line.
by_weekday = sanfran.pivot_table(values='IncidntNum', aggfunc='count',
                                 index='incident_weekday',
                                 columns='incident_category',
                                 fill_value=0)
ax = by_weekday.plot()
# Title and axis labels let the saved figure be read on its own.
ax.set(title='San Francisco incidents by day of week',
       xlabel='Day of week (0 = Monday, 6 = Sunday)',
       ylabel='Number of incidents')
os.makedirs('figs', exist_ok=True)  # savefig does not create a missing output directory
ax.figure.savefig('figs/sanfran_incidents_by_weekday.png', bbox_inches='tight')
In [82]:
# Number of San Francisco incidents per hour of the day, one line per incident_category.
# fill_value=0 stores an hour without incidents of some category as 0; without it
# the cell is NaN and that point is missing from the category's line.
by_hour = sanfran.pivot_table(values='IncidntNum', aggfunc='count',
                              index='incident_hour',
                              columns='incident_category',
                              fill_value=0)
ax = by_hour.plot()
# Title and axis labels let the saved figure be read on its own.
ax.set(title='San Francisco incidents by hour of day',
       xlabel='Hour of day',
       ylabel='Number of incidents')
os.makedirs('figs', exist_ok=True)  # savefig does not create a missing output directory
ax.figure.savefig('figs/sanfran_incidents_by_hour.png', bbox_inches='tight')