In [1]:
import numpy as np
import pandas as pd
import matplotlib
from scipy.stats import pearsonr
import matplotlib.pyplot as plt
from sklearn.kernel_ridge import KernelRidge
import seaborn as sns
sns.set(style="whitegrid")
%matplotlib inline
np.set_printoptions(precision=3)
# Load the daily crime-count series. The CSV ships with its own header row,
# so pass header=0 together with names= to replace it, instead of reading the
# header as data and slicing it off with the long-removed DataFrame.ix indexer.
df = pd.read_csv('ttl_daily.csv', header=0, names=['date', 'cnt'])
print(df.head())
df['date'] = pd.to_datetime(df['date'])
df = df.sort_values(by='date')
In [8]:
import sys

# Record the exact interpreter version so the environment is documented
# alongside the results.
interpreter_version = sys.version
print(interpreter_version)
In [2]:
# Restrict the crime series to 2009 onward and pull out the raw arrays.
df = df[df['date'].dt.year >= 2009]
vals = df['cnt'].values
dates = df['date'].values

# Load the Central Park weather data. DATE arrives as YYYYMMDD values; parse it
# directly with an explicit format instead of rebuilding 'DD/MM/YYYY' strings
# row by row. The old string round-trip relied on the removed DataFrame.ix
# indexer, was O(n) in Python, and was ambiguous to pd.to_datetime, which read
# e.g. '05/03/2015' as May 3 rather than March 5.
w = pd.read_csv('central_park_weather.csv')
w['DATE'] = pd.to_datetime(w['DATE'].astype(str), format='%Y%m%d')
w = w.sort_values(by='DATE')
w = w[w['DATE'].dt.year <= 2015]
w.head()
Out[2]:
In [3]:
# Scale the max-temperature column (the series is standardized in the next
# cell, so the factor only affects this raw preview).
w_vals = 100 * w['TMAX'].to_numpy()
w_vals
Out[3]:
In [4]:
# Standardize the temperature series to zero mean / unit variance.
mu, sigma = np.mean(w_vals), np.std(w_vals)
w_vals = (w_vals - mu) / sigma
In [5]:
# Standardize the crime counts the same way as the weather series.
mu, sigma = np.mean(vals), np.std(vals)
vals = (vals - mu) / sigma
In [6]:
# Number of daily observations in the crime series.
vals.shape[0]
Out[6]:
In [7]:
# Day indices: a flat vector for the smoother, a column vector for
# sklearn-style fitting; nb is the smoothing half-width in days.
x = np.arange(len(vals))
nb = 10
days = x.reshape(-1, 1)
def smooth(x, y, nb):
    """Centered moving average of ``y`` with half-width ``nb``.

    Parameters
    ----------
    x : array-like
        Positions; only its length is used, and it must not exceed len(y).
    y : np.ndarray
        Signal to smooth.
    nb : int
        Neighbors averaged on each side; the full window is 2*nb + 1,
        truncated at the array boundaries.

    Returns
    -------
    np.ndarray
        Smoothed signal of length ``len(x)``.
    """
    y_smooth = np.zeros(len(x))
    for i in range(len(y_smooth)):
        if i - nb < 0:
            # BUG FIX: the original hard-coded y[:i+11], which was only
            # correct for nb == 10; use the half-width parameter instead.
            y_smooth[i] = np.mean(y[:i + nb + 1])
        elif i + nb + 1 > len(y):
            y_smooth[i] = np.mean(y[i - nb:])
        else:
            y_smooth[i] = np.mean(y[i - nb:i + nb + 1])
    return y_smooth
# Smooth both signals with a 21-day centered window and overlay them.
y_smooth_avg = smooth(x, vals, 10)
smooth_weather = smooth(x, w_vals, 10)

plt.figure(figsize=(20, 10))
plt.plot(days, smooth_weather, c='green', linewidth=3, alpha=.7,
         label='Smoothed Max Temperature')
plt.scatter(days, vals, s=20, alpha=.5, c='skyblue', label='Daily Crime Incidents')
plt.plot(days, y_smooth_avg, c='red', alpha=.7, linewidth=3,
         label='Smoothed Crime Signal')
# Dropped the original KernelRidge fit here: its `smoothed_values` output was
# never plotted or reused anywhere in the notebook.

# xmin/xmax and ymin/ymax keyword aliases were removed from matplotlib;
# positional (left, right) / (bottom, top) are the supported spelling.
plt.xlim(0, len(vals))
plt.ylim(-6, 7)
# NOTE(review): 365-day tick spacing ignores leap years, so the year labels
# drift slightly across the axis; acceptable for a coarse multi-year view.
plt.xticks(np.arange(0, len(vals) + 1, 365).tolist(), np.arange(2009, 2017).tolist())

coef, p_value = pearsonr(w_vals, vals)
# Format the p-value directly instead of slicing the repr of a one-element
# numpy array, which was fragile across numpy print-option settings.
plt.title('Coefficient of Correlation between weather and crime: ' + str(np.round(coef, 3)) +
          '\n p value representing percent chance that this occurred by chance: ' + format(p_value, '.3g'),
          fontsize=20)
plt.legend(fontsize=20)
plt.yticks([])
plt.ylabel('Normalized Signal Value', fontsize=20)
plt.show()
In [15]:
# Smallest standardized crime value (sanity check on the plot's lower y-bound).
np.min(vals)
Out[15]:
In [24]:
# Scratch check of the repr-slicing hack used in the plot title: stringify a
# one-element array and slice off the first three characters and the trailing
# ']'. NOTE(review): output depends on np.set_printoptions(precision=3) set at
# the top of the notebook — brittle; prefer format(p_value, '.3g').
str(np.array([p_value]))[3:-1]
Out[24]:
In [ ]: