In this notebook, I examine the relationship between air temperature and the frequency of crimes. The analysis is based on crime data from January 1st, 2012 through June 7th, 2017, pulled from the City of Chicago Data Portal (link). I also collected historical weather data covering the same period, January 1st, 2012 through June 7th, 2017 (link).
In [1]:
#importing libraries
import pandas as pd
import numpy as np
from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns
import glob
import warnings
warnings.filterwarnings("ignore")
%matplotlib inline
In [2]:
# Load every CSV in ./crimeData into a single DataFrame.
# The files are read first and concatenated once: calling pd.concat inside the
# loop re-copies the accumulated frame on every iteration (quadratic cost).
# The paths are sorted so the row order is the same on every run.
crime_files = sorted(glob.glob('./crimeData/*.csv'))
if crime_files:
    data = pd.concat([pd.read_csv(path) for path in crime_files])
else:
    # No input files found: keep an empty frame, matching the original result.
    data = pd.DataFrame()
In [3]:
# Read the weather data (average daily temperature) from a CSV stored next to
# the notebook, then show its column names so the columns used by later cells
# ('Date', 'Mean_Temp') can be checked by eye.
temp = pd.read_csv('./climate_Data_Jan_1_2012_up_to_Jun_7_2017.csv')
temp.columns
Out[3]:
In [4]:
def cleanDate_W(d):
    """Parse a weather-data date written as month/day/two-digit-year.

    The value is converted to ``str`` first, parsed with the ``'%m/%d/%y'``
    format, and returned as a ``datetime.date``.
    """
    parsed = datetime.strptime(str(d), '%m/%d/%y')
    return parsed.date()
# Normalise the weather dates to pandas datetimes (unparseable values become
# NaT), then expose day / month / year as separate integer columns.
temp['Date'] = pd.to_datetime(temp['Date'].apply(cleanDate_W), errors='coerce')
for column, part in (('Day', 'day'), ('Month', 'month'), ('Year', 'year')):
    temp[column] = getattr(temp['Date'].dt, part)
In [5]:
def cleanDate_S(d):
    """Convert a crime-record timestamp string to a ``datetime.date``.

    Expected input looks like ``'01/01/2012 11:30:00 PM'``.

    Parameters
    ----------
    d : str or missing value
        Raw value from the crime data's ``Date`` column.

    Returns
    -------
    datetime.date
        The calendar date of the timestamp. Empty strings and missing values
        (``None`` / ``NaN``) are returned unchanged so that a later
        ``pd.to_datetime(..., errors='coerce')`` can turn them into ``NaT``.

    Raises
    ------
    ValueError
        If a non-empty value does not match the expected format.
    """
    # Missing values used to be stringified to 'nan' and crash strptime.
    if pd.isna(d) or d == '':
        return d
    # Note: '%p' has no effect on the parsed hour when paired with '%H'; that
    # is harmless here because only the date part is returned.
    return datetime.strptime(str(d), '%m/%d/%Y %H:%M:%S %p').date()
# Reduce each crime timestamp to its calendar date, then store it as a pandas
# datetime (values that cannot be converted become NaT).
data['Date'] = pd.to_datetime(data['Date'].apply(cleanDate_S), errors='coerce')
In [6]:
# Attach the daily weather record to every crime row that has a matching Date;
# rows without a match on either side are dropped (inner join).
crimeTemp = pd.merge(data, temp, on='Date', how='inner')
In [7]:
# (rows, columns) of the merged crime + weather frame.
crimeTemp.shape
Out[7]:
In [8]:
# Check that the weather columns (in particular Mean_Temp) are present after the merge.
crimeTemp.columns
Out[8]:
In [9]:
# Preview the first four merged rows.
crimeTemp.head(4)
Out[9]:
In [10]:
# Working frame for the analysis: one row per crime with its date, type and
# that day's mean temperature.
# .copy() makes C_count an independent frame, so the column assignments below
# do not write into a slice of crimeTemp (SettingWithCopy / chained assignment).
C_count = crimeTemp[['Date', 'Primary Type', 'Mean_Temp']].copy()
C_count['Year'] = C_count['Date'].dt.year
C_count['Month'] = C_count['Date'].dt.month
C_count['Day'] = C_count['Date'].dt.day
In [11]:
# Average temperature per month, one line per year.
# (The stray plt.figure() was dropped: plt.subplots already creates the figure,
# so the extra call only produced an empty one.)
fig, ax = plt.subplots(figsize=(15, 10))

# C_count has one row per crime, so reduce it to one row per calendar day
# before averaging. De-duplicating on (Year, Month, Mean_Temp) instead would
# merge distinct days that share the same temperature and bias the mean.
daily_temps = C_count[['Date', 'Year', 'Month', 'Mean_Temp']].drop_duplicates(subset='Date')
monthly_mean_temp = daily_temps.groupby(['Year', 'Month'])['Mean_Temp'].mean()

for year, year_mean in monthly_mean_temp.groupby(level='Year'):
    # Use the months actually present for this year as x values.
    ax.plot(year_mean.index.get_level_values('Month'), year_mean.values, label=year)

ax.set_xticks(range(1, 13))  # one tick per calendar month, independent of any single year
ax.set_xlabel('Month')
ax.set_ylabel("Average Temperature")
ax.legend()
plt.show()
In [12]:
# Number of days per year with a mean temperature of 80 degrees or more,
# counting only days from June onwards (Month > 5).
# For a per-month breakdown, group by ['Year', 'Month'] instead of 'Year'.
(
    temp
    .loc[lambda frame: (frame['Mean_Temp'] >= 80) & (frame['Month'] > 5)]
    .groupby('Year')['Month']
    .count()
)
Out[12]:
In [13]:
# Most common crime types over the whole period covered by the data.
# The counts are computed once and passed directly; the original recomputed
# value_counts() for each axis and also passed data=C_count, which does not
# correspond to the list-valued x / y.
crime_type_counts = C_count['Primary Type'].value_counts()

plt.figure(figsize=(10, 10))
sns.barplot(
    x=crime_type_counts.values.tolist(),
    y=crime_type_counts.index.tolist(),
    color='royalblue',
    orient='h',
)
plt.xlabel('Number of Crimes')
plt.ylabel('Primary Type')
plt.show()
Out[13]:
In [14]:
# Crimes per (Year, Month), then the yearly total over the summer months
# June, July and August.
crime_peak = (
    C_count
    .groupby(['Year', 'Month'])['Primary Type']
    .count()
    .reset_index()
)
crime_peak[crime_peak['Month'].between(6, 8)].groupby('Year')['Primary Type'].sum()
Out[14]:
In [15]:
# Count all crimes, regardless of type, for every (Year, Month) pair.
# The 'Primary Type' column of the result holds the count.
crime_monthly_count = (
    C_count
    .groupby([C_count['Date'].dt.year, C_count['Date'].dt.month])['Primary Type']
    .count()
    .rename_axis(['Year', 'Month'])
    .reset_index()
)
crime_monthly_count.head(3)
Out[15]:
In [16]:
# Number of crimes per month, one line per year.
# The drop in 2017 is due to the incomplete data for that year.
# (The stray plt.figure() was dropped: plt.subplots already creates the figure.)
fig, ax = plt.subplots(figsize=(10, 10))
for year, year_counts in crime_monthly_count.groupby('Year'):
    ax.plot(year_counts['Month'], year_counts['Primary Type'], label=year)

ax.set_xticks(range(1, 13))  # one tick per calendar month, independent of any single year
ax.set_xlabel('Month')
ax.set_ylabel("Number of Crimes")
ax.legend()
plt.show()
In [17]:
# Crimes per calendar day joined with that day's weather record, i.e. for each
# day: the average daily temperature was X and there were Y crimes
# (Y is stored in the 'Primary Type' column).
crime_weather = C_count
crime_daily_C_Temperature = (
    crime_weather
    .groupby(['Year', 'Month', 'Day'])['Primary Type']
    .count()
    .reset_index()
    .merge(temp, how='inner', on=['Year', 'Month', 'Day'])
)
crime_daily_C_Temperature
Out[17]:
In [18]:
# Regression of daily crime count on mean temperature, one line per year.
# sns.lmplot is figure-level: it creates its own figure and returns a
# FacetGrid, so the preceding plt.figure(figsize=...) only produced an extra
# empty figure and has been removed.
grid = sns.lmplot(
    x='Mean_Temp',
    y='Primary Type',
    hue='Year',
    data=crime_daily_C_Temperature,
    scatter=False,
)
# The 'Primary Type' column holds the per-day crime count here, so label it as such.
grid.set_axis_labels('Mean_Temp', 'Number of Crimes')
In [19]:
# Split each day's crime total by crime type: one row per
# (Year, Month, Day, Primary Type) with the number of incidents in 'Count'.
# E.g. a single day's total becomes N thefts, M batteries, and so on.
crimeReg = (
    C_count
    .groupby(['Year', 'Month', 'Day'])['Primary Type']
    .value_counts()
    .to_frame('Count')
    .reset_index()
)
crimeReg.head(3)
Out[19]:
In [20]:
# Add each day's weather record to the per-type daily counts.
crimeReg_Temperature = pd.merge(crimeReg, temp, how='inner', on=['Year', 'Month', 'Day'])
crimeReg_Temperature.head(3)
# Optional export, left disabled: crimeReg.to_csv('temp_reg_analysis.csv')
Out[20]:
In [21]:
# Scatter of daily incident count against mean temperature, one panel (and
# colour) per crime type, three panels per row, without a fitted line.
sns.lmplot(
    data=crimeReg_Temperature,
    x='Mean_Temp',
    y='Count',
    hue='Primary Type',
    col='Primary Type',
    col_wrap=3,
    scatter=True,
    fit_reg=False,
)
Out[21]:
In [22]:
# Preview the per-type daily counts with their weather columns.
crimeReg_Temperature.head(3)
Out[22]:
In [23]:
def toDay(d):
    """Return the weekday name (e.g. ``'Sunday'``) for a date-like value.

    Parameters
    ----------
    d : str, datetime.date, datetime.datetime or pandas.Timestamp
        Accepts both ``'%Y-%m-%d'`` and ``'%Y-%m-%d %H:%M:%S'`` strings. The
        previous implementation required the time part and raised on plain
        dates, despite being documented as taking ``%Y-%m-%d``.

    Returns
    -------
    str or None
        The weekday name produced by ``strftime('%A')``, or ``None`` when
        ``d`` is missing (``None`` / ``NaN`` / ``NaT``).
    """
    if pd.isna(d):
        return None
    return pd.Timestamp(d).strftime('%A')
# Store the weekday name of each row's Date in a new 'Day_Words' column.
crimeReg_Temperature['Day_Words'] = crimeReg_Temperature['Date'].apply(toDay)
In [24]:
# Weekday name -> 'Weekend' / 'Weekday'.
# The original literal listed 'Friday' twice ('Weekend', then 'Weekday'); in a
# dict literal the last entry wins, so Friday was effectively a weekday. The
# dead duplicate is removed and that effective behaviour is kept.
dictDay = {
    'Saturday': 'Weekend',
    'Sunday': 'Weekend',
    'Monday': 'Weekday',
    'Tuesday': 'Weekday',
    'Wednesday': 'Weekday',
    'Thursday': 'Weekday',
    'Friday': 'Weekday',
}
# Derive the label from 'Date' rather than re-mapping 'Day_Words' in place:
# mapping the column onto itself made this cell non-idempotent, because a
# second run would look up 'Weekend' / 'Weekday' in dictDay and yield NaN.
crimeReg_Temperature['Day_Words'] = (
    crimeReg_Temperature['Date'].dt.strftime('%A').map(dictDay)
)
In [ ]:
# Preview the frame again to check the Weekend / Weekday labels in 'Day_Words'.
crimeReg_Temperature.head(3)
Out[ ]:
In [ ]:
# Does the relationship between crime and temperature differ between weekends
# and weekdays? One panel per crime type, one regression line per day category.
# Legend (per the original notes): green = Weekday, blue = Weekend.
# Observations:
#   - Only Battery and Criminal Damage tend to be high on weekends as the
#     temperature increases.
#   - Some of the remaining crime types show an increase on weekdays, but not
#     on weekends.
#
# sns.lmplot is figure-level and creates its own figure, so the earlier
# plt.figure(figsize=...) call (which only produced an empty figure) is gone.
# x / y are passed by keyword, matching the other lmplot calls in this
# notebook; recent seaborn releases no longer accept them positionally.
sns.lmplot(
    x='Mean_Temp',
    y='Count',
    hue='Day_Words',
    data=crimeReg_Temperature,
    scatter=False,
    col='Primary Type',
    col_wrap=3,
)
# Summary:
# The results show that only battery tends to occur more frequently on warm
# days, especially on weekends (similar to the shootings pattern).
# The remaining patterns mainly appear on weekdays.