In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import datetime
from scipy.stats import pearsonr
from sklearn.kernel_ridge import KernelRidge
import seaborn as sns
# Apply seaborn's "whitegrid" theme to the figures drawn in the cells below.
sns.set(style="whitegrid")
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline
In [2]:
# Load the daily incident totals and roll them up to one row per calendar month.
#
# The first line of the file is discarded, exactly as the original `.ix[1:, :]`
# slice did (presumably it is a header row -- confirm against the CSV). Declaring
# it as the header (header=0) instead of slicing it off afterwards avoids the
# `.ix` indexer, which no longer exists in pandas, and keeps that row out of
# dtype inference so `cnt` is parsed as numbers when the rest of the column is.
df = pd.read_csv('ttl_daily.csv', names=['date', 'cnt'], header=0)
df['date'] = pd.to_datetime(df['date'])
# 'YYYY-MM' strings sort chronologically, so the grouped index is in time order.
df['month'] = df['date'].apply(lambda day: day.strftime('%Y-%m'))
# Sum only `cnt`: the datetime `date` column cannot be summed and must not be
# part of the aggregation. The result is a one-column frame indexed by month.
df = df.groupby(by='month')[['cnt']].sum()
In [3]:
# Split the monthly aggregate into plain NumPy arrays:
# the month labels (index) and the summed incident counts.
months = df.index.values
m_vals = df['cnt'].values
In [4]:
# Read the 'CPI' worksheet; header=1 takes the column names from the second
# row of the sheet. `sheet_name` is the keyword current pandas accepts
# (the old `sheetname` spelling has been removed).
c = pd.read_excel('unemployment_rate.xlsx', sheet_name='CPI', header=1)
c.head()
Out[4]:
In [5]:
# Pull the "Medical care services" column out of the CPI sheet as a flat array.
c_vals = c.loc[:, "Medical care services"].values.flatten()
c_vals
Out[5]:
In [6]:
def normalize(vals):
    """Return the z-scores of ``vals``.

    Each value has the mean of ``vals`` subtracted and is then divided by the
    standard deviation of ``vals`` as computed by ``np.std`` (population
    standard deviation, ddof=0).
    """
    centered = vals - np.mean(vals)
    spread = np.std(vals)
    return centered / spread
In [7]:
# Put both series on the same unitless (z-score) scale so they can be overlaid.
m_vals, c_vals = normalize(m_vals), normalize(c_vals)
In [8]:
# Show both series lengths side by side; pearsonr needs them to match.
tuple(len(series) for series in (m_vals, c_vals))
Out[8]:
In [9]:
# Month positions 0..n-1: `x` is the flat form, `X` the same values as a column.
x = np.arange(len(m_vals))
X = np.arange(x.shape[0]).reshape(-1, 1)
def smooth(x, y, nb):
    """Centered moving average of ``y`` with the window truncated at both ends.

    Parameters
    ----------
    x : sequence
        Only its length is used; it sets how many output points are produced.
    y : sequence of numbers
        Values to smooth.
    nb : int
        Number of neighbours taken on each side, so an interior point is the
        mean of ``2 * nb + 1`` samples.

    Returns
    -------
    numpy.ndarray
        Float array of ``len(x)`` smoothed values.

    Near the edges the window is clipped to the samples that exist. The left
    edge previously used a hard-coded ``y[:i+11]`` slice, which averaged up to
    11 samples regardless of ``nb``; it now mirrors the right edge.
    """
    n_points = len(x)
    y_smooth = np.zeros(n_points)
    for i in range(n_points):
        lo = max(i - nb, 0)            # clip the window at the start of y
        hi = min(i + nb + 1, len(y))   # clip the window at the end of y
        y_smooth[i] = np.mean(y[lo:hi])
    return y_smooth
# Smooth the CPI series with 1 neighbour per side and the crime series with 2.
smooth_cpi = smooth(x, c_vals, nb=1)
m_smooth_avg = smooth(x, m_vals, nb=2)
In [10]:
# Overlay the smoothed CPI series on the monthly crime signal and report the
# Pearson correlation of the two (unsmoothed, normalized) series in the title.
plt.figure(figsize=(20, 10))
plt.plot(X, smooth_cpi, c='orange', linewidth=3, alpha=.7, label='Smoothed CPI-MedicalCare Rate')
plt.scatter(X, m_vals, s=100, alpha=.5, c='steelblue', label='Monthly Crime Incidents')
plt.plot(X, m_smooth_avg, c='skyblue', alpha=.7, linewidth=4, label='Smoothed Crime Signal')
plt.xlim(xmin=0, xmax=len(m_vals))
plt.ylim(ymin=-4, ymax=4)
# One tick every 12 months, labelled 2006..2016.
plt.xticks(np.arange(0, 121, 12).tolist(), np.arange(2006, 2017).tolist())
# Both series are normalized, so the y axis carries no units; hide its ticks.
plt.yticks([])
coef, p_value = pearsonr(c_vals, m_vals)
plt.ylabel('Number of Crimes Per Month', fontsize=20)
plt.xlabel('Time, Graphed by Months', fontsize=20)
# The correlated series is the CPI "Medical care services" column, so the title
# says CPI. The p-value is formatted directly: slicing the string form of a
# one-element array depended on NumPy's old padded repr and drops the leading
# digit on current NumPy.
plt.title('Coefficient of Correlation Between CPI and Crime: ' + str(np.round(coef, 3)) +
          '\n p value representing percent chance that this occurred by chance: ' + '{:.3e}'.format(p_value),
          fontsize=20)
plt.legend(fontsize=20, loc=0)
plt.show()
In [11]:
# Smallest normalized monthly value (useful for choosing the plot's y-limits).
np.min(m_vals)
Out[11]:
In [12]:
# str(np.array([p_value]))[3:-1]
In [13]:
# Load the per-month counts broken down by KYCD code.
# NOTE(review): the column names are overwritten positionally, so this assumes
# the file has exactly three columns in the order month, code, count -- confirm.
df_kycdMonth = pd.read_csv('kycd_monthly.csv')
df_kycdMonth.columns = ['Month', 'KYCD', 'Count']
In [20]:
# Offence-description lookup table.
df_of = pd.read_csv('kycd_OfnsDesc.csv')
# Codes singled out as highly correlated to "Medical care services".
# (Alternative set, highly correlated to "Medical care commodities": [358, 233])
criminal_kycds = [117, 361, 107, 110, 124]
# Every KY_CD code, ordered from the largest 'Counts' value to the smallest.
arr = df_of.sort_values(by='Counts', ascending=False)['KY_CD'].values
In [21]:
# Rows of the offence table that belong to the selected codes.
DF = df_of.loc[df_of['KY_CD'].isin(criminal_kycds)]
DF
Out[21]:
In [22]:
# For every code in `arr`: normalize its monthly counts, then smooth with one
# neighbour on each side. Keyed by code.
dic1 = {
    code: smooth(x, normalize(df_kycdMonth.loc[df_kycdMonth['KYCD'] == code, 'Count'].values), 1)
    for code in arr
}
In [23]:
# Correlate each code's smoothed signal with the normalized CPI series.
# `coefs` and `p_values` hold [code, value] pairs in the same order; previously
# `p_values` was created but never filled, so the p-values were thrown away.
coefs = []
p_values = []
for k, v in dic1.items():
    plt.plot(X, v, alpha=.7, linewidth=5, label='Smoothed Signal' + str(k))
    coef, p_value = pearsonr(c_vals, v)
    coefs.append([k, coef])
    p_values.append([k, p_value])
coefs
Out[23]:
In [ ]:
In [24]:
# Same normalize-then-smooth treatment, restricted to the selected codes.
dic = {
    code: smooth(x, normalize(df_kycdMonth.loc[df_kycdMonth['KYCD'] == code, 'Count'].values), 1)
    for code in criminal_kycds
}
In [29]:
# Plot the smoothed CPI series against the smoothed signals of the selected
# offence codes; code 117 is highlighted as 'Dangerous Drugs'.
plt.figure(figsize=(20, 10))
plt.plot(X, smooth_cpi, c='black', linewidth=17, alpha=.3, label='Smoothed CPI Rate')
for k, v in dic.items():
    if k == 117:
        plt.plot(X, v, 'gold', alpha=.8, linewidth=10, label='Dangerous Drugs')
    else:
        plt.plot(X, v, alpha=.5, linewidth=5, label='Smoothed Signal' + str(k))
# The title reports the CPI vs. total-crime correlation. It does not depend on
# the loop variable, so it is computed once here instead of on every iteration,
# and its scalar result is no longer appended to `coefs` (a list of
# [code, coef] pairs), which grew with mismatched entries on every re-run.
coef, p_value = pearsonr(c_vals, m_vals)
plt.xlim(xmin=0, xmax=len(m_vals))
plt.ylim(ymin=-4, ymax=4)
# One tick every 12 months, labelled 2006..2016.
plt.xticks(np.arange(0, 121, 12).tolist(), np.arange(2006, 2017).tolist())
plt.yticks([])
plt.ylabel('Normalized Values Per Month', fontsize=20)
plt.xlabel('Time, Graphed by Months', fontsize=20)
# Format the p-value directly: slicing the string form of a one-element array
# depended on NumPy's old padded repr and drops the leading digit on current NumPy.
plt.title('Coefficient of Correlation Between CPI and Crime: ' + str(np.round(coef, 3)) +
          '\n p value representing percent chance that this occurred by chance: ' + '{:.3e}'.format(p_value),
          fontsize=20)
plt.legend(fontsize=20, loc=0)
plt.show()