In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import datetime
from scipy.stats import pearsonr
from sklearn.kernel_ridge import KernelRidge
import seaborn as sns
sns.set(style="whitegrid")
%matplotlib inline
In [4]:
# Load the daily incident counts and aggregate them to one total per calendar month.
# The file's own header row is replaced by our column names (header=0), which
# avoids reading it as data and then slicing it off with the removed `.ix` indexer;
# it also lets pandas parse `cnt` as a numeric column.
df = pd.read_csv('ttl_daily.csv', names=['date', 'cnt'], header=0)
df['date'] = pd.to_datetime(df['date'])
# 'YYYY-MM' strings sort chronologically, so the grouped index is in time order.
df['month'] = df['date'].dt.strftime('%Y-%m')
# Sum only the count column; the datetime column cannot be meaningfully summed.
df = df.groupby(by='month')[['cnt']].sum()
In [5]:
# Split the grouped frame into its index (month labels) and its count column,
# both as plain NumPy arrays.
months = df.index.values
m_vals = df.loc[:, 'cnt'].values
In [16]:
# Read the 'CPI' sheet; header=1 takes the column names from the second row.
# `sheet_name` is the supported keyword (the old `sheetname` spelling was removed from pandas).
c = pd.read_excel('unemployment_rate.xlsx', sheet_name='CPI', header=1)
c.head()
Out[16]:
In [19]:
# Take the "All items" column of the CPI sheet as a flat 1-D array copy.
all_items = c["All items"]
c_vals = all_items.values.flatten()
c_vals
Out[19]:
In [21]:
def normalize(vals):
    """Standardize ``vals`` to zero mean and unit (population) standard deviation.

    Parameters
    ----------
    vals : array-like of numbers
        Values to standardize. Converted to a float NumPy array first, so
        lists and integer arrays are accepted.

    Returns
    -------
    numpy.ndarray
        ``(vals - mean) / std`` computed over all elements. A constant input
        (standard deviation of zero) yields an array of zeros instead of the
        NaN/inf values a division by zero would produce. Empty input is
        returned as an empty float array.
    """
    arr = np.asarray(vals, dtype=float)
    if arr.size == 0:
        # Nothing to scale; avoid mean/std warnings on an empty array.
        return arr.copy()
    centered = arr - arr.mean()
    spread = arr.std()
    if spread == 0:
        # Every value equals the mean, so the centered series is already all zeros.
        return np.zeros_like(centered)
    return centered / spread
In [22]:
# Standardize both series with normalize() so they are on a comparable scale.
m_vals, c_vals = normalize(m_vals), normalize(c_vals)
In [23]:
# Show both series lengths side by side; pearsonr below needs them to be equal.
tuple(map(len, (m_vals, c_vals)))
Out[23]:
In [28]:
# Integer position of each month, as a flat vector (x) and as a column vector (X).
n_months = len(m_vals)
x = np.arange(n_months)
X = np.arange(n_months).reshape(-1, 1)
def smooth(x, y, nb):
    """Centered moving average of ``y`` with a half-window of ``nb`` points.

    For every position ``i`` in ``x`` the result is the mean of
    ``y[i-nb : i+nb+1]``. Near either end the window is truncated to the
    samples that exist, so edge points average fewer values.

    Parameters
    ----------
    x : sequence
        Only its length is used; it sets the number of output points.
    y : sequence of numbers
        Values to smooth, indexed by the same positions as ``x``.
    nb : int
        Number of neighbours taken on each side of the centre point.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``len(x)`` holding the smoothed values.
    """
    n_points = len(x)
    y_smooth = np.zeros(n_points)
    for i in range(n_points):
        # Clamp the left edge at 0; slicing already clamps the right edge at len(y).
        # (The left-edge case previously used a fixed `i+11` upper bound, which
        # ignored `nb` and averaged a much wider window for the first points.)
        window_start = max(i - nb, 0)
        y_smooth[i] = np.mean(y[window_start:i + nb + 1])
    return y_smooth
# Smoothed curves for plotting: CPI with 1 neighbour per side, crime counts with 2.
smooth_cpi = smooth(x, c_vals, nb=1)
m_smooth_avg = smooth(x, m_vals, nb=2)
In [29]:
# Pearson correlation between the two normalized (unsmoothed) series;
# the smoothed curves below are drawn for readability only.
coef, p_value = pearsonr(c_vals, m_vals)

plt.figure(figsize=(20, 10))
plt.plot(X, smooth_cpi, c='orange', linewidth=3, alpha=.7, label='Smoothed CPI Rate')
plt.scatter(X, m_vals, s=100, alpha=.5, c='steelblue', label='Monthly Crime Incidents')
plt.plot(X, m_smooth_avg, c='skyblue', alpha=.7, linewidth=4, label='Smoothed Crime Signal')
plt.xlim(0, len(m_vals))
plt.ylim(-4, 4)
# One tick every 12 months, labelled 2006..2016.
# NOTE(review): assumes the series starts in January 2006 -- confirm against the data.
plt.xticks(np.arange(0, 121, 12).tolist(), np.arange(2006, 2017).tolist())
# Both series are standardized, so absolute y values carry no meaning.
plt.yticks([])
plt.ylabel('Number of Crimes Per Month', fontsize=20)
plt.xlabel('Time, Graphed by Months', fontsize=20)
# The correlated series is CPI (see the curve label above), so the title names CPI.
# The p value is formatted directly: slicing the printed form of a NumPy array
# depends on NumPy's padding rules and drops leading digits on current versions.
plt.title('Coefficient of Correlation Between CPI and Crime: ' + str(np.round(coef, 3)) +
          '\n p value representing percent chance that this occurred by chance: ' + '{:.3g}'.format(p_value),
          fontsize=20)
plt.legend(fontsize=20, loc=0)
plt.show()
In [25]:
# Smallest standardized monthly count (used to sanity-check the y-axis limits).
np.min(m_vals)
Out[25]:
In [27]:
# str(np.array([p_value]))[3:-1]
In [159]:
# Monthly counts per offense code; the file's columns are renamed positionally.
kycd_month_columns = ['Month', 'KYCD', 'Count']
df_kycdMonth = pd.read_csv('kycd_monthly.csv')
df_kycdMonth.columns = kycd_month_columns
In [160]:
# Offense-code lookup table; `arr` lists the codes from highest to lowest 'Counts'.
df_of = pd.read_csv('kycd_OfnsDesc.csv')
by_count_desc = df_of.sort_values(by='Counts', ascending=False)
arr = by_count_desc['KY_CD'].values
arr
Out[160]:
In [161]:
# Map every offense code to its standardized, lightly smoothed monthly count series.
dic1 = {}
for code in arr:
    code_counts = df_kycdMonth.loc[df_kycdMonth.KYCD == code, 'Count'].values
    dic1[code] = smooth(x, normalize(code_counts), 1)
In [175]:
# Pearson correlation of each offense code's smoothed series against the CPI series.
# Both lists hold [code, value] pairs so the two stay aligned after sorting.
coefs = []
p_values = []
for k, v in dic1.items():
    coef, p_value = pearsonr(c_vals, v)
    coefs.append([k, coef])
    # Record the p value as well; this list was previously created but never filled.
    p_values.append([k, p_value])
# NOTE(review): entries are [code, value], so this orders by offense code, not by
# correlation strength -- use key=lambda pair: pair[1] if a ranking by r was intended.
coefs.sort()
p_values.sort()
coefs
Out[175]:
In [194]:
# Offense codes selected for the comparison plot, and their rows in the lookup table.
criminal_kycds = [112, 110, 107, 124]
# criminal_kycds = [117, 361, 351, 364, 104, 105]
is_selected = df_of['KY_CD'].isin(criminal_kycds)
DF = df_of.loc[is_selected]
DF
Out[194]:
In [ ]:
In [203]:
# Standardized, smoothed monthly series for just the selected offense codes.
dic = {}
for code in criminal_kycds:
    is_code = df_kycdMonth['KYCD'] == code
    monthly_counts = df_kycdMonth[is_code]['Count'].values
    dic[code] = smooth(x, normalize(monthly_counts), 1)
In [206]:
plt.figure(figsize=(20, 10))
# Wide, faint CPI band as the backdrop for the per-code crime curves.
plt.plot(X, smooth_cpi, c='black', linewidth=17, alpha=.2, label='Smoothed CPI Rate')
for k, v in dic.items():
    # Correlate each code's series with CPI and report it next to that curve.
    # The result is no longer appended to `coefs` (that list holds [code, coef]
    # pairs, and bare floats would corrupt it and grow it on every re-run).
    coef, p_value = pearsonr(c_vals, v)
    plt.plot(X, v, alpha=.7, linewidth=5,
             label='Smoothed Signal' + str(k) +
                   ' (r=' + str(np.round(coef, 3)) + ', p=' + '{:.3g}'.format(p_value) + ')')
plt.xlim(0, len(m_vals))
plt.ylim(-4, 4)
# One tick every 12 months, labelled 2006..2016.
# NOTE(review): assumes the series starts in January 2006 -- confirm against the data.
plt.xticks(np.arange(0, 121, 12).tolist(), np.arange(2006, 2017).tolist())
# All series are standardized, so absolute y values carry no meaning.
plt.yticks([])
plt.ylabel('Normalized Values Per Month', fontsize=20)
plt.xlabel('Time, Graphed by Months', fontsize=20)
# A single coefficient in the title would only describe whichever code was
# iterated last, so the title points to the per-code values in the legend.
plt.title('Coefficient of Correlation Between CPI and Crime, by Offense Code' +
          '\n r and p value for each code are listed in the legend',
          fontsize=20)
plt.legend(fontsize=8, loc=0)
plt.show()