In [108]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style="whitegrid")
from sklearn.kernel_ridge import KernelRidge
from scipy.interpolate import UnivariateSpline as spline
%matplotlib inline
# Load the daily totals. The file is read without a header row, so the two
# columns are named explicitly and the first row is then dropped
# (presumably a header/placeholder record -- TODO confirm against ttl_daily.csv).
# .iloc replaces the DataFrame.ix indexer, which was removed in pandas 1.0;
# .copy() makes the slice an independent frame before columns are assigned.
df = pd.read_csv('ttl_daily.csv', names=['date', 'cnt']).iloc[1:, :].copy()
df['date'] = pd.to_datetime(df['date'])
# Order chronologically and renumber rows 0..n-1 so the index can be used as
# a running day counter by the later cells.
df = df.sort_values(by='date').reset_index(drop=True)
In [109]:
# Sanity check after loading: print (rows, columns) and display the first rows.
print(df.shape)
df.head()
Out[109]:
In [110]:
# String keys used to pick out particular calendar days:
#   m_d -> zero-padded 'MM-DD', d -> zero-padded day of month 'DD'.
# The vectorised .dt accessor replaces a per-row apply(lambda ...) and does
# not raise on missing dates.
df['m_d'] = df['date'].dt.strftime('%m-%d')
df['d'] = df['date'].dt.strftime('%d')
df.head()
Out[110]:
In [111]:
# Pull the count and date columns out of the frame as NumPy arrays.
vals = df['cnt'].values
dates = df['date'].values
In [159]:
# Scratch preview of an 8-colour HLS palette (lightness 0.3, saturation 0.7);
# the result is only displayed, not stored.
sns.hls_palette(8, l=.3, s=.7)
Out[159]:
In [165]:
# Calendar days ('MM-DD') to highlight and the display label for each.
# The two lists are parallel: festivals[i] is the label for special_days[i].
# US Independence Day is July 4th; the previous '07-14' entry did not match
# its 'Independence' label.
special_days = ['01-01', '02-14', '04-01', '07-04', '10-31', '12-24', '12-25']
festivals = ['New Year', 'Valentines', 'April Fool', 'Independence', 'Halloween', 'Xmas Eve', 'Xmas']
# One distinct hue per special day; sized from the list so the two cannot drift apart.
colors = sns.color_palette("hls", len(special_days))
In [113]:
# Keep only the rows whose month-day key is one of the highlighted days.
is_special = df['m_d'].isin(special_days)
df_sp = df[is_special]
df_sp.head()
Out[113]:
In [114]:
# Keep only the rows that fall on the first day of a month
# ('d' holds the zero-padded day-of-month string).
is_first_of_month = df['d'] == '01'
df_1st = df[is_first_of_month]
df_1st.head()
Out[114]:
In [115]:
def smooth(x, y, nb):
    """Centred moving average of ``y`` using ``nb`` samples on each side.

    For position ``i`` the result is the mean of ``y[i-nb : i+nb+1]``; near
    either end the window is truncated to the samples that exist, so edge
    values are averaged over fewer points.

    Parameters
    ----------
    x : sequence
        Only its length is used: one output value is produced per element.
    y : array-like
        Values to smooth; expected to be at least as long as ``x``.
    nb : int
        Half-width of the window (the full window is ``2*nb + 1`` samples).

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(x)`` holding the smoothed values.
    """
    n_out = len(x)
    n_in = len(y)
    y_smooth = np.zeros(n_out)
    for i in range(n_out):
        # Clamp the window to the valid index range. The left edge previously
        # used a hard-coded ``i + 11``, which is only right when nb == 10.
        lo = max(i - nb, 0)
        hi = min(i + nb + 1, n_in)
        y_smooth[i] = np.mean(y[lo:hi])
    return y_smooth
In [116]:
# Day axis: the frame's 0..n-1 row index doubles as a running day counter.
x = df.index.values
y = np.array(vals)
days = x.reshape(-1, 1)

# Moving average with 10 samples on either side of each day.
y_smooth_avg = smooth(x, y, nb=10)

# Positions, counts and column-vector positions for the festival rows ...
x_sp, y_sp = df_sp.index.values, df_sp.cnt.values
days_sp = x_sp.reshape(-1, 1)

# ... and for the first-of-month rows.
x_1st, y_1st = df_1st.index.values, df_1st.cnt.values
days_1st = x_1st.reshape(-1, 1)
In [129]:
# Debug check: the festival counts and their positions must have the same length.
print(y_sp.shape, days_sp.shape, len(y_sp), len(days_sp))
In [132]:
# Debug peek: row position of the second festival point.
x_sp[1]
Out[132]:
In [38]:
# NOTE(review): smooth() is also defined in an earlier cell of this notebook;
# whichever cell runs last wins. Keep the two in sync or delete one.
def smooth(x, y, nb):
    """Centred moving average of ``y`` using ``nb`` samples on each side.

    For position ``i`` the result is the mean of ``y[i-nb : i+nb+1]``; near
    either end the window is truncated to the samples that exist, so edge
    values are averaged over fewer points.

    Parameters
    ----------
    x : sequence
        Only its length is used: one output value is produced per element.
    y : array-like
        Values to smooth; expected to be at least as long as ``x``.
    nb : int
        Half-width of the window (the full window is ``2*nb + 1`` samples).

    Returns
    -------
    numpy.ndarray
        Float array of length ``len(x)`` holding the smoothed values.
    """
    n_out = len(x)
    n_in = len(y)
    y_smooth = np.zeros(n_out)
    for i in range(n_out):
        # Clamp the window to the valid index range. The left edge previously
        # used a hard-coded ``i + 11``, which is only right when nb == 10.
        lo = max(i - nb, 0)
        hi = min(i + nb + 1, n_in)
        y_smooth[i] = np.mean(y[lo:hi])
    return y_smooth
In [182]:
# Daily counts, their moving average, and the festival days highlighted
# with one colour and one text label per festival.
plt.figure(figsize=(20, 10))
plt.scatter(days, vals, s=20, alpha=.5, c='skyblue', label='Crime by Day')
plt.plot(days, y_smooth_avg, c='steelblue', alpha=.9, linewidth=3, label='Smoothed Crime Signal')

# Resolve each festival point's label and colour from its own 'MM-DD' key.
# Matching by position (idx % number of festivals) silently mislabels every
# later point if a special day is missing from the data or the series does
# not start on 01-01, and scatter() needs one colour per point rather than
# one per festival.
label_by_day = dict(zip(special_days, festivals))
color_by_day = dict(zip(special_days, colors))
sp_keys = df_sp['m_d'].tolist()
point_colors = [color_by_day[key] for key in sp_keys]
plt.scatter(days_sp, y_sp, c=point_colors, s=80, label='Festival')

font = {'family': 'Helvetica Neue',
        'weight': 'normal',
        'size': 14}
for px, py, key in zip(x_sp, y_sp, sp_keys):
    # Offset the text slightly up and to the right of its marker.
    plt.text(px + 5, py + 15, label_by_day[key], fontdict=font)

# Positional limits: the xmin/xmax/ymin/ymax keywords are no longer accepted
# by current matplotlib.
plt.xlim(0, len(y))
plt.ylim(300, 1850)
# A tick every 365 days approximates year boundaries (leap days are ignored);
# labels count up from 2006 and always match the number of ticks.
tick_pos = np.arange(0, len(y) + 1, 365)
plt.xticks(tick_pos.tolist(), (2006 + np.arange(len(tick_pos))).tolist())
plt.ylabel('number of crimes per day', fontsize=20)
plt.xlabel('Time, Graphed by Days', fontsize=20)
plt.title('NYC Crime with Festivals', fontsize=30)
plt.legend(fontsize=15, loc=0)
plt.show()
In [178]:
# Daily counts, their moving average, and the first day of every month
# highlighted in red.
plt.figure(figsize=(20, 10))
plt.scatter(days, vals, s=20, alpha=.5, c='skyblue', label='Crime by Day')
plt.plot(days, y_smooth_avg, c='steelblue', alpha=.9, linewidth=3, label='Smoothed Crime Signal')
plt.scatter(days_1st, y_1st, c='red', s=30, alpha=0.5, label='First Day of Month')

# Positional limits: the xmin/xmax/ymin/ymax keywords are no longer accepted
# by current matplotlib.
plt.xlim(0, len(y))
plt.ylim(300, 1850)
# A tick every 365 days approximates year boundaries (leap days are ignored);
# labels count up from 2006 and always match the number of ticks.
tick_pos = np.arange(0, len(y) + 1, 365)
plt.xticks(tick_pos.tolist(), (2006 + np.arange(len(tick_pos))).tolist())
plt.ylabel('number of crimes per day', fontsize=20)
plt.xlabel('Time, Graphed by Days', fontsize=20)
plt.title('NYC Crime with First Day of Month', fontsize=30)
plt.legend(fontsize=15, loc=0)
plt.show()
In [120]:
from scipy import interpolate

# Fit a cubic spline through the daily counts and plot it against the raw data.
x = np.arange(len(vals))
y = vals

# s=0 requests an interpolating spline, i.e. one that passes through every point.
tck = interpolate.splrep(x, y, s=0)
# Evaluate across the whole span of the data (four samples per day). The
# previous grid covered only [0, 2*pi) and came with a sin(x) reference curve,
# both carried over from the SciPy tutorial example.
xnew = np.linspace(x[0], x[-1], 4 * len(x))
ynew = interpolate.splev(xnew, tck, der=0)

plt.figure(figsize=(25, 10))
# Each series is labelled explicitly so the legend entries match what is drawn.
plt.plot(x, y, 'x', label='Crime by Day')
plt.plot(x, y, 'r', label='Linear')
plt.plot(xnew, ynew, label='Cubic Spline')
plt.legend()
plt.show()
In [ ]: