In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import datetime
from scipy.stats import pearsonr
from sklearn.kernel_ridge import KernelRidge
import seaborn as sns
sns.set(style="whitegrid")
%matplotlib inline
In [2]:
# Load the daily incident counts. The file's first line is skipped, exactly as
# the previous `.ix[1:, :]` slice did (`.ix` no longer exists in pandas), and
# the two columns are named explicitly. Passing header=0 together with names=
# also lets pandas infer a numeric dtype for `cnt`.
# NOTE(review): presumably the skipped first line is a header row - confirm.
df = pd.read_csv('ttl_daily.csv', names=['date', 'cnt'], header=0)
print(df.head())

# Parse the dates and derive a 'YYYY-MM' key to aggregate on.
df['date'] = pd.to_datetime(df['date'])
df['month'] = df['date'].dt.strftime('%Y-%m')
print(df.head())

# Collapse to one row per month. Only `cnt` is summed: a datetime column cannot
# be added up, and current pandas raises instead of silently dropping it.
df = df.groupby(by='month')[['cnt']].sum()
df.head()
Out[2]:
In [3]:
# Pull the index labels and the `cnt` column of the monthly frame out as
# plain NumPy arrays for the numeric work below.
months = df.index.values
m_vals = df['cnt'].values
In [4]:
# Read the 'unemploy' sheet of the unemployment workbook. `sheet_name` is the
# current spelling of the keyword; the older `sheetname` has been removed.
ue = pd.read_excel('unemployment_rate.xlsx', sheet_name='unemploy')

# Drop the first column by position and flatten the remaining cells row by row
# into a single 1-D array (`.iloc` replaces the removed `.ix` indexer).
# NOTE(review): assumes the first column is a row label (e.g. the year) and the
# other columns hold the monthly rates - confirm against the workbook.
ue_vals = ue.iloc[:, 1:].values.flatten()
ue_vals
Out[4]:
In [5]:
# Rescale each series to zero mean and unit (population) standard deviation so
# the two can share one axis.
m_vals = (m_vals - m_vals.mean()) / m_vals.std()
ue_vals = (ue_vals - ue_vals.mean()) / ue_vals.std()
In [6]:
# Display the number of points in each series as a (crime, unemployment) pair.
tuple(map(len, (m_vals, ue_vals)))
Out[6]:
In [7]:
# Integer positions 0..N-1 for the monthly series: `x` is the flat 1-D form,
# `X` is the same positions as an (N, 1) column.
n_points = len(m_vals)
x = np.arange(n_points)
X = np.arange(n_points).reshape(-1, 1)
def smooth(x, y, nb):
    """Centred moving average of `y`, with the window clipped at both ends.

    Parameters
    ----------
    x : sequence
        Only its length is used: one smoothed value is produced per element.
    y : sequence of numbers
        Values to average.
    nb : int
        Number of neighbours taken on each side of a point; a full window
        spans ``2 * nb + 1`` values.

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(x)`` whose element ``i`` is the mean of
        ``y[max(0, i - nb) : i + nb + 1]``.
    """
    n_out = len(x)
    y_smooth = np.zeros(n_out)
    for i in range(n_out):
        # Clip the window to the valid index range. The left edge used to
        # average y[:i + 11], a hard-coded width unrelated to `nb`, which
        # smeared the first `nb` outputs over up to 11 points.
        lo = max(0, i - nb)
        hi = min(len(y), i + nb + 1)
        y_smooth[i] = np.mean(y[lo:hi])
    return y_smooth
# Moving averages of both series: +/-2 months for crime, +/-1 month for
# unemployment.
m_smooth_avg = smooth(x, m_vals, 2)
smooth_unemploy = smooth(x, ue_vals, 1)

# Pearson correlation between the unsmoothed series; reported in the title.
coef, p_value = pearsonr(ue_vals, m_vals)

plt.figure(figsize=(20, 10))
plt.plot(X, smooth_unemploy, c='orange', linewidth=3, alpha=.7, label='Smoothed Unemployment Rate')
plt.scatter(X, m_vals, s=100, alpha=.5, c='steelblue', label='Monthly Crime Incidents')
plt.plot(X, m_smooth_avg, c='skyblue', alpha=.7, linewidth=4, label='Smoothed Crime Signal')

# Limits are passed positionally: the xmin/xmax and ymin/ymax keywords are no
# longer accepted by current matplotlib.
plt.xlim(0, len(m_vals))
plt.ylim(-4, 4)

# One tick every 12 points, labelled 2006..2016.
# NOTE(review): assumes the series starts in January 2006 - confirm.
plt.xticks(np.arange(0, 121, 12).tolist(), np.arange(2006, 2017).tolist())
plt.yticks([])

plt.ylabel('Number of Crimes Per Month', fontsize=20)
plt.xlabel('Time, Graphed by Months', fontsize=20)

# Format the p-value directly. Slicing str(np.array([p_value]))[3:-1] relied
# on the padding older NumPy put inside the brackets and cuts off leading
# digits with current NumPy output.
plt.title('Coefficient of Correlation Between Unemployment Rate and Crime: ' + str(np.round(coef, 3)) +
          '\n p value representing percent chance that this occurred by chance: ' + '{:.3g}'.format(p_value),
          fontsize=20)
plt.legend(fontsize=20, loc=0)
plt.show()
In [110]:
# Display the smallest value in the crime series.
np.min(m_vals)
Out[110]:
In [109]:
# Display the p-value as text with three significant digits. Slicing the
# printed form of a one-element array ([3:-1]) depended on NumPy's old bracket
# padding and drops leading digits with current NumPy.
'{:.3g}'.format(p_value)
Out[109]:
In [ ]: