In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import datetime
from scipy.stats import pearsonr
from sklearn.kernel_ridge import KernelRidge
import seaborn as sns
sns.set(style="whitegrid")
%matplotlib inline
In [2]:
# Load the daily incident counts and aggregate them into monthly totals.
# The first row is dropped exactly as before (presumably the file's own header
# line, which `names=` turns into a data row -- confirm against ttl_daily.csv).
# `.iloc` replaces the `.ix` indexer, which was removed in pandas 1.0, and
# `.copy()` avoids SettingWithCopyWarning on the column assignments below.
df = pd.read_csv('ttl_daily.csv', names=['date', 'cnt']).iloc[1:].copy()
print(df.head())
df['date'] = pd.to_datetime(df['date'])
# A non-numeric first row leaves 'cnt' as strings; convert it so the monthly
# sum adds counts instead of concatenating text.
df['cnt'] = pd.to_numeric(df['cnt'])
# df = df.sort_values(by='date')
df['month'] = df['date'].dt.strftime('%Y-%m')
print(df.head())
# Sum only 'cnt': current pandas raises when asked to sum the datetime column.
df = df.groupby(by='month')[['cnt']].sum()
df.head()
Out[2]:
In [3]:
# Pull the month labels (the groupby index) and the monthly totals out of the
# aggregated frame as plain NumPy arrays.
months, m_vals = df.index.values, df['cnt'].values
In [4]:
# Read the 'unemploy' sheet of the unemployment workbook.
# `sheet_name` is the current keyword; the old `sheetname` spelling was removed
# from pandas.read_excel.
ue = pd.read_excel('unemployment_rate.xlsx', sheet_name='unemploy')
# Drop the first column and flatten the remaining cells row by row into one
# 1-D series (presumably a year column followed by one column per month --
# confirm against the workbook). `.iloc` replaces the removed `.ix` indexer.
ue_vals = ue.iloc[:, 1:].values.flatten()
ue_vals
Out[4]:
In [5]:
def normalize(vals):
    """Return ``vals`` standardised to z-scores.

    The mean of ``vals`` is subtracted and the result is divided by the
    standard deviation as computed by ``np.std`` (population form, ddof=0).
    """
    centered = vals - np.mean(vals)
    spread = np.std(vals)
    return centered / spread
In [6]:
# Put both series on the same z-score scale so they can share one axis.
m_vals, ue_vals = normalize(m_vals), normalize(ue_vals)
In [7]:
# Show the lengths of the two series side by side.
tuple(map(len, (m_vals, ue_vals)))
Out[7]:
In [8]:
# Month positions 0..N-1: `x` as a flat vector, `X` as an (N, 1) column.
x = np.arange(len(m_vals))
X = np.arange(x.size).reshape(-1, 1)
def smooth(x, y, nb):
    """Smooth ``y`` with a centred moving average of half-width ``nb``.

    Parameters
    ----------
    x : array-like
        Only its length is used; it sets the number of output points.
    y : sequence of numbers
        Values to smooth.
    nb : int
        Number of neighbours taken on each side of a point.

    Returns
    -------
    numpy.ndarray
        Array of ``len(x)`` floats. Point ``i`` is the mean of
        ``y[i-nb : i+nb+1]``, with the window truncated at either end of ``y``.

    Notes
    -----
    The leading-edge window used to be the hard-coded slice ``y[:i+11]``, so
    the first ``nb`` points averaged at least eleven samples whatever ``nb``
    was. It now uses ``i + nb + 1`` like every other position.
    """
    n_points = len(x)
    y_smooth = np.zeros(n_points)
    for i in range(n_points):
        lo = max(i - nb, 0)             # clip the window at the start of y
        hi = min(i + nb + 1, len(y))    # clip the window at the end of y
        y_smooth[i] = np.mean(y[lo:hi])
    return y_smooth
# Smoothed versions of the two normalised series; `nb` is the neighbourhood
# half-width handed to smooth().
m_smooth_avg = smooth(x, m_vals, nb=2)
smooth_unemploy = smooth(x, ue_vals, nb=1)
In [9]:
# Smallest normalised monthly value.
np.min(m_vals)
Out[9]:
In [10]:
# str(np.array([p_value]))[3:-1]
In [11]:
# Monthly counts per offence code. The file's own header names are replaced
# with the fixed names used throughout the rest of the notebook.
kycd_monthly_columns = ['Month', 'KYCD', 'Count']
df_kycdMonth = pd.read_csv('kycd_monthly.csv')
df_kycdMonth.columns = kycd_monthly_columns
In [12]:
# Offence-description lookup table, one row per KY_CD code.
df_of = pd.read_csv('kycd_OfnsDesc.csv')

# Legend text for each offence code that gets its own smoothed signal.
labels = {
    236: 'DANGEROUS WEAPONS',
    232: 'POSSESSION OF STOLEN PROPERTY',
    358: 'OFFENSES INVOLVING FRAUD',
}
criminal_kycds = [236, 232, 358]  # Best
# criminal_kycds = [361, 118, 359] # Second Best
# df_of.sort_values(by='Counts', ascending=False).head(30)
In [19]:
# KY_CD codes of the nine rows with the largest 'Counts'.
top_offenses = df_of.sort_values(by='Counts', ascending=False).head(9)
top_offenses['KY_CD'].values
Out[19]:
In [20]:
# Show the description rows for a hand-picked list of KY_CD codes.
df_of.loc[df_of['KY_CD'].isin([341, 578, 344, 351, 109, 235, 361, 105, 107])]
Out[20]:
In [13]:
# Description rows for the offence codes selected in `criminal_kycds`.
is_selected_code = df_of['KY_CD'].isin(criminal_kycds)
DF = df_of.loc[is_selected_code]
DF
Out[13]:
In [14]:
# Map each selected offence code to its normalised, lightly smoothed monthly
# count series.
dic = {}
for code in criminal_kycds:
    code_counts = df_kycdMonth.loc[df_kycdMonth['KYCD'] == code, 'Count'].values
    dic[code] = smooth(x, normalize(code_counts), 1)
In [23]:
# Overlay the smoothed unemployment rate with the smoothed signal of each
# selected offence code, and report the Pearson correlation between the
# normalised unemployment and total-crime series in the title.
plt.figure(figsize=(20, 10))
plt.plot(X, smooth_unemploy, c='black', linewidth=20, alpha=.2, label='Smoothed Unemployment Rate')
# plt.scatter(X, m_vals, s=100, alpha=.5, c='steelblue', label = 'Monthly Crime Incidents')
# plt.plot(X, m_smooth_avg, c='skyblue', alpha=.7, linewidth=4, label = 'Smoothed Crime Signal')
for k, v in dic.items():
    plt.plot(X, v, alpha=.7, linewidth=5, label='Smoothed ' + labels[k].capitalize() + ' Signal')
# Positional limits work on every matplotlib version.
plt.xlim(0, len(m_vals))
plt.ylim(-4, 4)
# One tick per year; assumes the series starts in January 2006 and spans
# 120 months -- TODO confirm against the data.
plt.xticks(np.arange(0, 121, 12).tolist(), np.arange(2006, 2017).tolist())
plt.yticks([])
coef, p_value = pearsonr(ue_vals, m_vals)
plt.ylabel('Normalized Values Per Month', fontsize=20)
plt.xlabel('Time, Graphed by Months', fontsize=20)
# plt.title('NYC Crime Over Time', fontsize = 30)
# Format the p-value explicitly. The previous str(np.array([p_value]))[3:-1]
# relied on the padding of old NumPy array reprs and drops leading digits on
# NumPy >= 1.14.
p_value_text = '{:.3e}'.format(p_value)
plt.title('Coefficient of Correlation Between Unemployment Rate and Crime: ' + str(np.round(coef, 3)) +
          '\n p value representing percent chance that this occurred by chance: ' + p_value_text,
          fontsize=20)
plt.legend(fontsize=20, loc=0)
plt.show()