In [154]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib
%pylab inline
pylab.rcParams['figure.figsize'] = (15, 8)
import heapq
matplotlib.style.use('ggplot')
# Load both incident extracts.  DataFrame.from_csv is deprecated (and removed
# in pandas 1.0); read_csv with index_col=0 and parse_dates=True reproduces
# the defaults from_csv applied.
sf = pd.read_csv('sanfrancisco_incidents_summer_2014.csv', index_col=0, parse_dates=True)
se = pd.read_csv('seattle_incidents_summer_2014.csv', index_col=0, parse_dates=True)
# Index the Seattle rows by their report timestamp; values that cannot be
# parsed become NaT because of errors='coerce'.
se['Time'] = pd.to_datetime(se['Date Reported'], errors='coerce')
se.index = se['Time']
# Keep only rows reported in June, July or August (month 6-8) ...
se = se[(se.index.month > 5) & (se.index.month < 9)]
# ... and only those from years before 2015.
se = se[(se.index.year < 2015)]
# Number of Seattle rows that survive the filters (displayed as the cell output).
len(se)
Out[154]:
In [155]:
# Make the Seattle column labels attribute-friendly: every space and every
# slash in a label is replaced by an underscore.
se.columns = [label.replace(' ', '_').replace('/', '_') for label in se.columns]
# Show the resulting column names.
se.columns.tolist()
Out[155]:
In [156]:
# Show the San Francisco column names for comparison with the Seattle ones.
sf.columns.tolist()
Out[156]:
In [157]:
# Distinct offense categories present in the Seattle data.
se['Summarized_Offense_Description'].unique()
Out[157]:
In [158]:
# Distinct crime categories present in the San Francisco data.
sf['Category'].unique()
Out[158]:
In [159]:
# Distinct district/sector codes present in the Seattle data.
se['District_Sector'].unique()
Out[159]:
In [160]:
# Distinct police districts present in the San Francisco data.
sf['PdDistrict'].unique()
Out[160]:
In [161]:
# Count Seattle incidents per (offense type, month) pair, then pivot the
# month level into columns so each offense type becomes one stacked bar.
a = (
    se.groupby(['Summarized_Offense_Description', 'Month'])
    .size()
    .unstack()
)
a.plot(stacked=True, kind='bar')
plt.title('Seattle crime by month')
plt.show()
In [162]:
# Parse the San Francisco incident dates (unparseable values become NaT) and
# use them as the frame's index.
sf['Date'] = pd.to_datetime(sf['Date'], errors='coerce')
sf.index = sf['Date']
# Count incidents per (category, calendar month) and pivot months into columns.
b = (
    sf.groupby(['Category', sf.index.month])
    .size()
    .unstack()
)
b.plot(stacked=True, kind='bar')
plt.title('San Francisco crime by month')
plt.show()
In [163]:
# NOTE: heapq.nlargest is imported but not called in this cell; the calls
# below are the pandas DataFrame.nlargest method.
from heapq import nlargest
# Per-offense count of non-null values in every column.
se_count = se.groupby('Summarized_Offense_Description').count()
# The 15 offense types with the highest 'Offense_Code' count.
se_top15 = se_count.nlargest(15, 'Offense_Code')
# print(...) with one parenthesised argument behaves identically under
# Python 2 and Python 3, unlike the bare print statement.
print(se_top15['Offense_Code'])
print(se_top15.index)
In [164]:
# Re-derive the report timestamp (the column is called 'Date_Reported' once
# the column labels have been underscored) and index the frame by it.
se['Time'] = pd.to_datetime(se['Date_Reported'], errors='coerce')
se.index = se['Time']
# Restrict to the 15 most frequent offense types.
data_se_top15 = se[se['Summarized_Offense_Description'].isin(se_top15.index)]
# Bucket every incident into a six-hour block (0, 1, 2 or 3).  Floor division
# (//) keeps the bucket an integer even where "/" means true division
# (Python 3, or Python 2 with `from __future__ import division`); with plain
# "/" each distinct hour would become its own fractional bucket.
c = data_se_top15.groupby(['Summarized_Offense_Description', data_se_top15.index.map(lambda t: t.hour // 6)]).size().unstack()
p = c.plot(kind='bar', stacked=True)
patches, labels = p.get_legend_handles_labels()
# Replace the numeric bucket labels with readable time ranges; this assumes
# all four buckets occur in the data, in ascending order.
p.legend(patches, ['0-6AM','6AM-12PM','12PM-6PM','6PM-Midnight'], loc='best')
plt.title('Seattle top 15 crime types by time of day')
plt.show()
In [165]:
# Evening = 18:00 up to midnight.  The comparison has to be >= 18: a strict
# "> 18" silently drops every incident reported between 18:00 and 18:59,
# which contradicts the '6PM-Midnight' legend.  No upper bound is needed
# because the hour component never exceeds 23.
data_se_evening = se[se.index.hour >= 18]
# Number of evening incidents per offense type.
se_evening_crime = data_se_evening.groupby(['Summarized_Offense_Description']).size()
p = se_evening_crime.plot(kind='bar')
patches, labels = p.get_legend_handles_labels()
p.legend(patches, ['6PM-Midnight'], loc='best')
plt.title('Seattle crime types in the evening')
plt.show()
In [166]:
from itertools import cycle, islice
# Ten grey levels from black towards white.  Every channel uses float
# division: under Python 2 the bare `x/10` is integer division and yields 0,
# which would give (x/10.0, 0, 0) instead of a grey.
# NOTE: my_colors is not referenced by the plot in this cell.
my_colors = [(x / 10.0, x / 10.0, x / 10.0) for x in range(10)]
# The 10 offense types with the highest 'Offense_Code' count.
se_top10 = se_count.nlargest(10, 'Offense_Code')
data_se_top10 = se[se['Summarized_Offense_Description'].isin(se_top10.index)]
# Incidents per (district sector, offense type), offense types as columns.
c = data_se_top10.groupby(['District_Sector', 'Summarized_Offense_Description']).size().unstack()
p = c.plot(kind='bar', stacked=True, colormap='gist_rainbow')
patches, labels = p.get_legend_handles_labels()
plt.title('Seattle top 10 crime types by district')
plt.show()
In [167]:
# Seattle incidents classified as vehicle theft, bike theft or robbery.
data_se_thef = se[se['Summarized_Offense_Description'].isin(['VEHICLE THEFT', 'BIKE THEFT', 'ROBBERY'])]
# Count them per district sector.  The result gets its own name instead of
# being stored in `se_evening_crime`, which would overwrite the evening
# series with unrelated data.
se_theft_by_district = data_se_thef.groupby(['District_Sector']).size()
p = se_theft_by_district.plot(kind='bar')
patches, labels = p.get_legend_handles_labels()
p.legend(patches, ['Theft/Robbery'], loc='best')
plt.title('Seattle theft/robbery by district')
plt.show()
In [168]:
# Daily incident counts for the top-10 Seattle offense types: one row per
# calendar date, one column per offense type.
se_byDate = (
    data_se_top10
    .groupby([data_se_top10.index.date, 'Summarized_Offense_Description'])
    .size()
    .unstack()
)
# Preview the first ten days.
se_byDate.head(10)
Out[168]:
In [169]:
# Correlation between the daily count series of the offense types, drawn as
# a green-scale matrix with a labelled colour bar.
se_corr_matrix = se_byDate.corr()
cor_crime = plt.matshow(se_corr_matrix, cmap=plt.cm.Greens)
cb = plt.colorbar(cor_crime)
plt.title('Seattle crime type correlation on a day-to-day basis')
cb.set_label('Correlation coefficient')
In [170]:
import datetime
# data_se_top10 was produced by filtering `se`; take an explicit copy before
# adding a column so the assignment does not trigger SettingWithCopyWarning.
data_se_top10 = data_se_top10.copy()
# Day of the week as an integer, Monday=0 ... Sunday=6.
data_se_top10['weekday'] = data_se_top10['Time'].dt.dayofweek
# Incidents per (weekday, offense type), offense types as columns.
se_byDay = data_se_top10.groupby(['weekday', 'Summarized_Offense_Description']).size().unstack()
p = se_byDay.plot(kind='bar', stacked=True, colormap='gist_rainbow')
patches, labels = p.get_legend_handles_labels()
# Place the legend to the left of the axes, in axes-fraction coordinates.
p.legend(patches, labels, loc=(-0.2, 0))
plt.title('Seattle top 10 crime types by day of the week')
plt.show()
In [171]:
# Per-category count of non-null values in every San Francisco column.
sf_count = sf.groupby('Category').count()
# The 10 categories with the highest 'Date' count.
sf_top10 = sf_count.nlargest(10, 'Date')
# print(...) with one parenthesised argument behaves identically under
# Python 2 and Python 3, unlike the bare print statement.
print(sf_top10['Date'])
In [172]:
# Rows belonging to the 10 most frequent San Francisco categories.  Copy the
# filtered frame so the column assignment below works on an independent
# object instead of a slice of `sf` (avoids SettingWithCopyWarning).
data_sf_top10 = sf[sf['Category'].isin(sf_top10.index)].copy()
# Parse the time-of-day column into timestamps so the hour can be extracted.
data_sf_top10['Time'] = pd.to_datetime(data_sf_top10['Time'])
# Bucket every incident into a six-hour block (0, 1, 2 or 3).  Floor division
# keeps the bucket an integer even where "/" means true division.
c = data_sf_top10.groupby(['Category', data_sf_top10['Time'].dt.hour // 6]).size().unstack()
p = c.plot(kind='bar', stacked=True)
patches, labels = p.get_legend_handles_labels()
# Replace the numeric bucket labels with readable time ranges; this assumes
# all four buckets occur in the data, in ascending order.
p.legend(patches, ['0-6AM','6AM-12PM','12PM-6PM','6PM-Midnight'], loc='best')
plt.title('San Francisco top 10 crime types by time of the day')
plt.show()
In [173]:
# Hour of day for every San Francisco incident, computed without re-indexing
# `sf`: replacing the shared frame's index here is a hidden side effect that
# makes other cells depend on execution order.
sf_hour = pd.to_datetime(sf['Time']).dt.hour
# Evening = 18:00 up to midnight.  The comparison has to be >= 18: a strict
# "> 18" drops every incident between 18:00 and 18:59, contradicting the
# '6PM-Midnight' legend.  `.values` applies the mask positionally, so no
# index alignment is involved.
data_sf_evening = sf[(sf_hour >= 18).values]
# Number of evening incidents per category.
sf_evening_crime = data_sf_evening.groupby(['Category']).size()
p = sf_evening_crime.plot(kind='bar')
patches, labels = p.get_legend_handles_labels()
p.legend(patches, ['6PM-Midnight'], loc='best')
plt.title('San Francisco crime in the evening')
plt.show()
In [174]:
# Incidents per (police district, category) for the top-10 categories, with
# categories pivoted into columns so each district is one stacked bar.
c = (
    data_sf_top10
    .groupby(['PdDistrict', 'Category'])
    .size()
    .unstack()
)
p = c.plot(colormap='gist_rainbow', stacked=True, kind='bar')
patches, labels = p.get_legend_handles_labels()
plt.title('San Francisco top 10 crime types by district')
plt.show()
In [175]:
# Incidents per (hour of day, police district); one line per district.
# Relies on data_sf_top10['Time'] already holding parsed timestamps.
incident_hour = data_sf_top10['Time'].dt.hour
c = data_sf_top10.groupby([incident_hour, 'PdDistrict']).size().unstack()
c.plot()
plt.title('Crime in different SF district over time of day')
Out[175]:
In [176]:
# Daily incident counts for the top-10 San Francisco categories: one row per
# calendar date of the frame's index, one column per category.
sf_byDate = (
    data_sf_top10
    .groupby([data_sf_top10.index.date, 'Category'])
    .size()
    .unstack()
)
# Preview the first ten days.
sf_byDate.head(10)
Out[176]:
In [177]:
# Correlation between the daily count series of the categories, drawn as a
# green-scale matrix with a labelled colour bar.
sf_corr_matrix = sf_byDate.corr()
cor_crime = plt.matshow(sf_corr_matrix, cmap=plt.cm.Greens)
cb = plt.colorbar(cor_crime)
plt.title('San Francisco crime type correlation on a day-to-day basis')
cb.set_label('Correlation coefficient')
In [178]:
# Incidents per (day of week, category), categories as columns.
sf_byDay = data_sf_top10.groupby(['DayOfWeek', 'Category']).size().unstack()
# groupby sorts its keys, so the rows come out in alphabetical order.  Put
# them in calendar order instead, matching the Monday-first ordering of the
# Seattle weekday chart.
# NOTE(review): assumes DayOfWeek holds full English day names - confirm
# against the data.  Labels not in the list are kept and appended at the end,
# so no row is dropped if the assumption is wrong.
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
ordered_days = [day for day in weekday_order if day in sf_byDay.index]
ordered_days += [day for day in sf_byDay.index if day not in weekday_order]
sf_byDay = sf_byDay.reindex(ordered_days)
p = sf_byDay.plot(kind='bar', stacked=True, colormap='gist_rainbow')
patches, labels = p.get_legend_handles_labels()
# Place the legend to the left of the axes, in axes-fraction coordinates.
p.legend(patches, labels, loc=(-0.2, 0))
plt.title('San Francisco top 10 crime types by day of the week')
plt.show()
In [179]:
# Offense categories that exist in both cities.  The two lists are
# position-aligned: se_list[i] and sf_list[i] name the same kind of crime in
# each city's vocabulary, in the same order as the tick labels below.
se_list=['VEHICLE THEFT', 'ASSAULT', 'ROBBERY', 'PROSTITUTION', 'FRAUD', 'DISORDERLY CONDUCT', 'BURGLARY','NARCOTICS',
'WARRANT ARREST' ,'COUNTERFEIT' , 'WEAPON', 'LIQUOR VIOLATION', 'PROPERTY DAMAGE']
sf_list=['VEHICLE THEFT', 'ASSAULT', 'ROBBERY', 'PROSTITUTION', 'FRAUD', 'DISORDERLY CONDUCT', 'BURGLARY','DRUG/NARCOTIC',
'WARRANTS', 'FORGERY/COUNTERFEITING', 'WEAPON LAWS', 'LIQUOR LAWS', 'VANDALISM']
data_se_common= se[se['Summarized_Offense_Description'].isin(se_list)]
data_sf_common= sf[sf['Category'].isin(sf_list)]
# groupby().size() returns the counts sorted alphabetically by label, and the
# two cities use different labels, so the raw results line up neither with
# each other nor with the tick labels.  Reindexing by the aligned lists puts
# bar i of both cities on the same crime type; fill_value=0 keeps the length
# at len(se_list) even if a category has no rows.
c=data_se_common.groupby(['Summarized_Offense_Description']).size().reindex(se_list, fill_value=0)
d=data_sf_common.groupby(['Category']).size().reindex(sf_list, fill_value=0)
ind = np.arange(len(se_list))
width = 0.35
# Stacked bars: Seattle at the bottom, San Francisco on top of it.
p1=plt.bar(ind, c.values, width, color='r')
p2=plt.bar(ind,d.values, width, color='y',bottom=c.values)
plt.title('Total crime of the two cities')
plt.legend((p1[0], p2[0]), ('Seattle', 'San Francisco'))
plt.xticks(ind + width/2., ('Veh. Theft', 'Assault', 'Robbery', 'Pros', 'Fraud', 'Disor. Cond', 'Burglary', 'Drug',
'Warrant', 'Counterfeit', 'Weapon', 'Liquor', 'Vandalism'))
plt.show()
In [180]:
import math
# Population figures used to normalise the counts.
se_population = 652000
sf_population = 837000
# Align both count series to the position-matched category lists before
# dividing.  groupby().size() orders its result alphabetically by label, so
# without this step bar i could show different crime types for the two cities
# and disagree with the tick labels.  fill_value=0 keeps one entry per
# category; the reindex is a no-op if the series are already in list order.
se_counts = c.reindex(se_list, fill_value=0)
sf_counts = d.reindex(sf_list, fill_value=0)
se_percap = [x / float(se_population) for x in se_counts.values]
sf_percap = [x / float(sf_population) for x in sf_counts.values]
# Stacked bars: Seattle at the bottom, San Francisco on top of it.
p1=plt.bar(ind, se_percap , width, color='r')
p2=plt.bar(ind,sf_percap , width, color='y',bottom=se_percap)
plt.title('Crime per capita of the two cities')
plt.legend((p1[0], p2[0]), ('Seattle', 'San Francisco'))
plt.xticks(ind + width/2., ('Veh. Theft', 'Assault', 'Robbery', 'Pros', 'Fraud', 'Disor. Cond', 'Burglary', 'Drug',
'Warrant', 'Counterfeit', 'Weapon', 'Liquor', 'Vandalism'))
plt.show()
In [183]:
# Index San Francisco by incident date again so both frames can be grouped
# by the calendar date of their index.
sf.index = pd.to_datetime(sf['Date'])
# Total incidents per calendar day in each city.
sf_byDate_total = sf.groupby([sf.index.date]).size()
se_byDate_total = se.groupby([se.index.date]).size()
# San Francisco is drawn first and Seattle second; the legend below lists
# Seattle first.
p2 = plt.plot(sf_byDate_total.index, sf_byDate_total)
p1 = plt.plot(se_byDate_total.index, se_byDate_total)
plt.title('Total crime of the two cities over the summer 2014')
plt.legend((p1[0], p2[0]), ('Seattle', 'San Francisco'))
Out[183]:
In [211]:
from mpl_toolkits.basemap import Basemap
import fiona
from itertools import chain
# Read the bounding box of the tract shapefile.  The `with` block closes the
# file handle even if reading the bounds raises.
# NOTE: `bds` is not used below; the map extent comes from the hard-coded
# corners `ll` and `ur`.
with fiona.open('seattleMap/kc_tract_10.shp') as shp:
    bds = shp.bounds
# Fractional padding applied around the requested extent.
extra = 0.0005
# Lower-left and upper-right corners of the map as (longitude, latitude).
ll = (-122.516, 47.3096)
ur = (-122.064, 47.8616)
# Flatten to [lon_min, lat_min, lon_max, lat_max], then derive width/height.
coords = list(chain(ll, ur))
w, h = coords[2] - coords[0], coords[3] - coords[1]
# NOTE(review): the latitude corners use `-/+ extra + extra * h` while the
# longitude corners use `-/+ extra * w`; the asymmetry is reproduced as found
# - confirm it is intended.
m = Basemap(
projection='tmerc',
ellps = 'WGS84',
lat_0=47.58,
lon_0=-122.29,
llcrnrlon=coords[0] - extra * w,
llcrnrlat=coords[1] - extra + extra * h,
urcrnrlon=coords[2] + extra * w,
urcrnrlat=coords[3] + extra + extra * h,
lat_ts=0,
resolution='h',
suppress_ticks=True)
# Draw coastlines, and the edges of the map.
m.drawcoastlines()
m.drawmapboundary()
# Only the vehicle-theft incidents are plotted.
se_vehTheft=se[se['Summarized_Offense_Description']=='VEHICLE THEFT']
# Convert latitude and longitude to x and y coordinates
x, y = m(list(se_vehTheft["Longitude"].astype(float)), list(se_vehTheft["Latitude"].astype(float)))
# Use matplotlib to draw the points onto the map.
m.scatter(x,y,1,marker='o',color='red')
# Show the plot.
plt.show()
In [ ]: