In [1]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import matplotlib
import seaborn as sns
sns.set_style("darkgrid")
sns.set_context("poster")
In [2]:
incidents = pd.read_csv('sanfrancisco_incidents_summer_2014.csv')
incidents.columns = ['Id','Category','Description','DayOfWeek','Date','Time','District','Resolution','Address','Longitude','Latitude','Location','PdId']
pd.DataFrame(incidents.columns)
Out[2]:
In [3]:
sfo_incident_category = pd.DataFrame(pd.unique(incidents.Category.ravel()))
sfo_incident_category
Out[3]:
In [4]:
incidents.head(5)
Out[4]:
In [5]:
missing_count = {}
for col_name in incidents.columns:
count_of_null = len(incidents[incidents[col_name].isnull()])
if count_of_null > 0:
missing_count[col_name] = count_of_null
if len(missing_count) == 0:
print "All columns have data for all rows"
else:
print "List of columns and the count of rows which have missing data"
pd.DataFrame(missing_count)
In [7]:
# the date and time of incident are in two separate columns
# combining them into a date_time column
incidents['DateTime'] = pd.to_datetime(incidents['Date'] + ' ' + incidents['Time'])
date_idx = pd.DatetimeIndex(incidents['DateTime'])
#incidents['Date'] = date_idx.date.astype('datetime64')
incidents['Hour'] = date_idx.hour
incidents['Year'] = date_idx.year
incidents['Month'] = date_idx.month
incidents['Weekday'] = date_idx.weekday
In [8]:
pd.DataFrame({'count' : incidents.groupby( ['Year','Month'] ).size()}).reset_index()
Out[8]:
There is only one year and data spans for the three months of Jun-Aug. Looks consistent although there is no way to confirm.
In [9]:
by_year = incidents.pivot_table('Id', aggfunc='count',
index='Weekday',
columns='District')
ax = by_year.plot(kind="line",sort_columns=True)
http://www.legalmatch.com/law-library/article/what-are-the-different-types-of-crimes.html
I took the 4 prominent categories and remapped them to the best of my knowledge.
See more at: http://www.legalmatch.com/law-library/article/what-are-the-different-types-of-crimes.html#sthash.xHKGRbs4.dpuf
In [10]:
category_groups = {
'[INC - CASE DC USE ONLY]': 'OTHER OFFENSES',
'ANIMAL COMPLAINT': 'OTHER OFFENSES',
'ARSON': 'PROPERTY',
'ASSAULT': 'PERSONAL',
'BIAS INCIDENT': 'INCHOATE',
'BIKE THEFT': 'PROPERTY',
'BRIBERY': 'PROPERTY',
'BURGLARY': 'PROPERTY',
'BURGLARY-SECURE PARKING-RES': 'PROPERTY',
'CAR PROWL': 'INCHOATE',
'COUNTERFEIT': 'PROPERTY',
'DISORDERLY CONDUCT': 'INCHOATE',
'DISPUTE': 'INCHOATE',
'DISTURBANCE': 'INCHOATE',
'DRIVING UNDER THE INFLUENCE': 'STATUTORY',
'DRUG/NARCOTIC': 'STATUTORY',
'DRUNKENNESS': 'STATUTORY',
'DUI': 'STATUTORY',
'ELUDING': 'INCHOATE',
'EMBEZZLE': 'PROPERTY',
'EMBEZZLEMENT': 'PROPERTY',
'ESCAPE': 'STATUTORY',
'EXTORTION': 'INCHOATE',
'FALSE REPORT': 'INCHOATE',
'FAMILY OFFENSES': 'INCHOATE',
'FIREWORK': 'PROPERTY',
'FORGERY': 'PROPERTY',
'FORGERY/COUNTERFEITING': 'PROPERTY',
'FRAUD': 'PROPERTY',
'GAMBLING': 'PROPERTY',
'HOMICIDE': 'PERSONAL',
'ILLEGAL DUMPING': 'STATUTORY',
'INJURY': 'PERSONAL',
'KIDNAPPING': 'PERSONAL',
'LARCENY/THEFT': 'PROPERTY',
'LIQUOR LAWS': 'STATUTORY',
'LIQUOR VIOLATION': 'STATUTORY',
'LOITERING': 'INCHOATE',
'LOST PROPERTY': 'PROPERTY',
'MAIL THEFT': 'PROPERTY',
'MISSING PERSON': 'PERSONAL',
'NARCOTICS': 'STATUTORY',
'NON-CRIMINAL': 'NON-CRIMINAL',
'OBSTRUCT': 'PROPERTY',
'OTHER OFFENSES': 'OTHER OFFENSES',
'OTHER PROPERTY': 'PROPERTY',
'PICKPOCKET': 'PROPERTY',
'PORNOGRAPHY': 'INCHOATE',
'PORNOGRAPHY/OBSCENE MAT': 'INCHOATE',
'PROPERTY DAMAGE': 'PROPERTY',
'PROSTITUTION': 'INCHOATE',
'PUBLIC NUISANCE': 'INCHOATE',
'PURSE SNATCH': 'PROPERTY',
'RECKLESS BURNING': 'PROPERTY',
'RECOVERED PROPERTY': 'PROPERTY',
'ROBBERY': 'PROPERTY',
'RUNAWAY': 'INCHOATE',
'SECONDARY CODES': 'STATUTORY',
'SHOPLIFTING': 'PROPERTY',
'STOLEN PROPERTY': 'PROPERTY',
'SUICIDE': 'PERSONAL',
'SUSPICIOUS OCC': 'PROPERTY',
'THEFT OF SERVICES': 'PROPERTY',
'THREATS': 'INCHOATE',
'TRAFFIC': 'STATUTORY',
'TRESPASS': 'INCHOATE',
'VANDALISM': 'PROPERTY',
'VEHICLE THEFT': 'PROPERTY',
'VIOLATION OF COURT ORDER': 'STATUTORY',
'WARRANT ARREST': 'STATUTORY',
'WARRANTS': 'STATUTORY',
'WEAPON': 'STATUTORY',
'WEAPON LAWS': 'STATUTORY'
}
# Map the incident codes to a smaller set
incidents['CategoryGroup'] = incidents['Category'].apply(lambda col: category_groups[col])
In [24]:
#count_by_ctg = sanfran.groupby(['Category']).size()
#count_by_ctg.sort_values(ascending=True,inplace=True)
#ax = count_by_ctg.plot(kind="barh",sort_columns=True)
count_by_category = pd.DataFrame({'count' : incidents.groupby( ['Category'] ).size()}).reset_index()
count_by_category.sort_values(by='count',ascending= True,inplace=True)
count_by_category['category_idx'] = list(range(len(count_by_category.index)))
ax = count_by_category.plot(kind="barh",x='Category', y='count',sort_columns=True)
#df.groupby(key_columns).size()
In [25]:
#drunken = sanfran[sanfran['Category'] == 'DRUNKENNESS']
#count_by_district = sanfran.groupby(['PdDistrict']).size()
#count_by_district.sort_values(ascending=True,inplace=True)
count_by_district = pd.DataFrame({'count' : incidents.groupby( ['District'] ).size()}).reset_index()
count_by_district.sort_values(by='count',ascending= True,inplace=True)
count_by_district['district_idx'] = list(range(len(count_by_district.index)))
ax = count_by_district.plot(kind="barh",x='District', y='count',sort_columns=True)
In [26]:
#count_by_district = pd.DataFrame({'count' : incidents.groupby( ['District','Hour'] ).size()}).reset_index()
#count_by_district.sort_values(by='count',ascending= True,inplace=True)
#count_by_district['district_idx'] = list(range(len(count_by_district.index)))
#count_by_district
#ax = count_by_district.plot() #kind="barh",x='District', y='count',sort_columns=True)
In [37]:
categories = dict(zip(count_by_category.Category,count_by_category.category_idx))
districts = dict(zip(count_by_district['District'], count_by_district['district_idx']))
districts
Out[37]:
In [42]:
ctg_x_district = pd.DataFrame({'count' : incidents.groupby( ['District', 'Category'] ).size()}).reset_index()
ctg_x_district.columns
ctg_x_district['district_idx'] = ctg_x_district['District'].apply(lambda col: districts[col])
ctg_x_district['category_idx'] = ctg_x_district['Category'].apply(lambda col: categories[col])
ctg_x_district
ax = ctg_x_district.plot(kind='scatter',x=3,y=4,s=ctg_x_district['count']*3)
ax.set_ylim(-1,35)
ax.set_xlim(0,9)
#ax.set_xticklabels(count_by_district['PdDistrict'])
#by_date = sanfran.pivot_table('IncidntNum', aggfunc='count',
# index='Category',
# columns='PdDistrict' )
#ctg_x_district.plot(kind='scatter',x=1,y=2)
#g = sns.FacetGrid(ctg_x_district, row="Category", col="incident_hour")
#g.map(plt.hist, "count")
Out[42]:
In [16]:
count_by_category
Out[16]:
In [ ]:
plt.style.available
In [ ]:
district
In [17]:
ctg_x_dist_x_weekday = pd.DataFrame({'count' : incidents.groupby( ['District', 'Category','Hour'] ).size()}).reset_index()
In [18]:
#ctg_x_district.columns
district = ctg_x_dist_x_weekday[ctg_x_dist_x_weekday['District'] == 'CENTRAL']
g = sns.FacetGrid(district, row="Category", col="District")
g.map(plt.scatter, "Hour", "count")
In [23]:
In [43]:
filtered = incidents[incidents['District'] == 'CENTRAL']
#filtered = prostitution[prostitution['District'] == 'CENTRAL]
by_district_hour = pd.DataFrame({'count' : filtered.groupby( ['Category','Hour'] ).size()}).reset_index()
#by_district_hour.sort_values(by='count',ascending= True,inplace=True)
#by_district_hour
by_district_hour.plot(x='Hour',y='count')
Out[43]:
In [ ]:
ctg_city = incidents.groupby(['District','Category']).size()
incidents.plot(kind='scatter', x='District', y='Category');
In [ ]:
by_date = incidents.pivot_table('Id', aggfunc='count',
index='Date',
columns='Category' )
ax = by_date.plot()
ax.figure.savefig('figs/sfo_incidents_by_date.png', bbox_inches='tight')
In [ ]:
by_weekday = incidents.pivot_table('Id', aggfunc='count',
index='Weekday',
columns='Wategory' )
ax = by_weekday.plot()
ax.figure.savefig('figs/sfo_incidents_by_weekday.png', bbox_inches='tight')
In [ ]:
by_hour = incidents.pivot_table('Id', aggfunc='count',
index='Hour',
columns='Category' )
ax = by_hour.plot()
ax.figure.savefig('figs/sfo_incidents_by_hour.png', bbox_inches='tight')
In [ ]:
property_theft = sanfran[sanfran['incident_category'] == 'PROPERTY']
inchoate = sanfran[sanfran['incident_category'] == 'INCHOATE']
personal = sanfran[sanfran['incident_category'] == 'PERSONAL']
non_criminal = sanfran[sanfran['incident_category'] == 'NON-CRIMINAL']
statutory = sanfran[sanfran['incident_category'] == 'STATUTORY']
In [ ]:
by_date = incidents.pivot_table('Id', aggfunc='count',
index='Date',
columns='Category' )
ax = by_date.plot()
ax.figure.savefig('figs/sfo_by_date_and_category.png', bbox_inches='tight')
In [ ]:
by_weekday = incidents.pivot_table('Id', aggfunc='count',
index='Weekday',
columns='Category' )
ax = by_weekday.plot()
ax.figure.savefig('figs/sfo_by_weekday_and_category.png', bbox_inches='tight')
In [ ]:
by_hour = incidents.pivot_table('Id', aggfunc='count',
index='Hour',
columns='Category' )
ax = by_hour.plot()
ax.figure.savefig('figs/sfo_incidents_by_hour.png', bbox_inches='tight')
In [ ]:
state_topo = r'maps/us_states.topo.json'
county_topo = r'maps/us_counties.topo.json'
geo_data = [{'name': 'counties',
'url': county_topo,
'feature': 'us_counties.geo'}
,{'name': 'states',
'url': state_topo,
'feature': 'us_states.geo'}]
vis = Map( geo_data=geo_data
, scale=4000
, translate=[1480, 830]
, projection='albersUsa'
, map_key={'wa_counties': 'properties.COUNTY'})
del vis.marks[1].properties.update
vis.marks[0].properties.update.fill.value = '#084081'
vis.marks[1].properties.enter.stroke.value = '#fff'
vis.marks[0].properties.enter.stroke.value = '#7bccc4'
vis.display()