In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import datetime
from scipy.stats import pearsonr
from sklearn.kernel_ridge import KernelRidge
import seaborn as sns
sns.set(style="whitegrid")
%matplotlib inline

In [2]:
df = pd.read_csv('ttl_daily.csv', names=['date', 'cnt']).ix[1:,:]
print(df.head())

df['date'] = pd.to_datetime(df['date'])
# df = df.sort_values(by='date')

df['month'] = df.date.apply(lambda x: x.strftime('%Y-%m')) 
print(df.head())

df = df.groupby(by='month').sum()
df.head()


         date   cnt
1  01/01/2006  1485
2  01/01/2007  1636
3  01/01/2008  1631
4  01/01/2009  1547
5  01/01/2010  1757
        date   cnt    month
1 2006-01-01  1485  2006-01
2 2007-01-01  1636  2007-01
3 2008-01-01  1631  2008-01
4 2009-01-01  1547  2009-01
5 2010-01-01  1757  2010-01
Out[2]:
cnt
month
2006-01 28335
2006-02 24071
2006-03 28543
2006-04 27803
2006-05 30058

In [3]:
m_vals = df['cnt'].values
months = df.index.values

In [4]:
ue = pd.read_excel('unemployment_rate.xlsx', sheetname='unemploy')
ue_vals = ue.ix[:, 1:].values.flatten()
ue_vals


Out[4]:
array([ 4.8,  4.7,  4.7,  4.7,  4.7,  4.6,  4.6,  4.5,  4.4,  4.3,  4.3,
        4.3,  4.3,  4.3,  4.4,  4.4,  4.5,  4.6,  4.7,  4.7,  4.8,  4.8,
        4.8,  4.9,  4.9,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,  5.6,  5.8,
        6.1,  6.4,  6.8,  7.2,  7.6,  8. ,  8.2,  8.4,  8.6,  8.7,  8.8,
        8.8,  8.9,  8.9,  8.9,  8.8,  8.8,  8.7,  8.7,  8.6,  8.5,  8.5,
        8.5,  8.5,  8.5,  8.4,  8.4,  8.3,  8.2,  8.1,  8.1,  8.1,  8.2,
        8.2,  8.3,  8.4,  8.4,  8.5,  8.5,  8.6,  8.6,  8.7,  8.7,  8.7,
        8.7,  8.6,  8.4,  8.3,  8.2,  8.2,  8.1,  8. ,  7.9,  7.8,  7.7,
        7.7,  7.6,  7.6,  7.6,  7.5,  7.3,  7.2,  7. ,  6.9,  6.8,  6.7,
        6.6,  6.5,  6.3,  6.2,  6.1,  6. ,  5.9,  5.9,  5.8,  5.7,  5.7,
        5.6,  5.5,  5.4,  5.3,  5.2,  5.1,  5. ,  5. ,  4.9,  4.9])

In [5]:
m_vals = (m_vals - np.mean(m_vals)) / np.std(m_vals)
ue_vals = (ue_vals - np.mean(ue_vals)) / np.std(ue_vals)

In [6]:
len(m_vals), len(ue_vals)


Out[6]:
(120, 120)

In [7]:
x = np.arange(len(m_vals))
X = np.arange(len(m_vals)).reshape([-1,1])

def smooth(x, y, nb):
    y_smooth = np.zeros(x.shape[0])
    for i in range(len(x)):
        if i-nb < 0:
            y_smooth[i] = np.mean(y[:i+11])
        elif i+nb+1 > len(y):
            y_smooth[i] = np.mean(y[i-nb:])
        else:
            y_smooth[i] = np.mean(y[i-nb:i+nb+1])
    return y_smooth
            
            
m_smooth_avg = smooth(x, m_vals, 2)
smooth_unemploy = smooth(x, ue_vals, 1)


plt.figure(figsize=(20, 10))
plt.plot(X, smooth_unemploy, c='orange', linewidth=3, alpha=.7, label = 'Smoothed Unemployment Rate')

plt.scatter(X, m_vals, s=100, alpha=.5, c='steelblue', label = 'Monthly Crime Incidents')

plt.plot(X, m_smooth_avg, c='skyblue', alpha=.7, linewidth=4, label = 'Smoothed Crime Signal')

plt.xlim(xmin=0, xmax=len(m_vals))
plt.ylim(ymin=-4, ymax=4)
plt.xticks(np.arange(0, 121, 12).tolist(), np.arange(2006, 2017).tolist())
plt.yticks([])

coef, p_value = pearsonr(ue_vals, m_vals)

plt.ylabel('Number of Crimes Per Month', fontsize = 20)
plt.xlabel('Time, Graphed by Months', fontsize = 20)
# plt.title('NYC Crime Over Time', fontsize = 30)
plt.title('Coefficient of Correlation Between Unemployment Rate and Crime: ' + str(np.round(coef, 3)) + 
         '\n p value representing percent chance that this occurred by chance: ' + str(np.array([p_value]))[3:-1],
          fontsize=20)
plt.legend(fontsize = 20, loc=0)

plt.show()


---------------------------------------------------------------------------
ValueError                                Traceback (most recent call last)
<ipython-input-7-04c41262848d> in <module>()
     19 
     20 plt.figure(figsize=(20, 10))
---> 21 plt.plot(X, smooth_unemploy, c='orange', linewidth=3, alpha=.7, label = 'Smoothed Unemployment Rate')
     22 
     23 plt.scatter(X, m_vals, s=100, alpha=.5, c='steelblue', label = 'Monthly Crime Incidents')

/Users/Viola/anaconda/lib/python3.5/site-packages/matplotlib/pyplot.py in plot(*args, **kwargs)
   3159         ax.hold(hold)
   3160     try:
-> 3161         ret = ax.plot(*args, **kwargs)
   3162     finally:
   3163         ax.hold(washold)

/Users/Viola/anaconda/lib/python3.5/site-packages/matplotlib/__init__.py in inner(ax, *args, **kwargs)
   1817                     warnings.warn(msg % (label_namer, func.__name__),
   1818                                   RuntimeWarning, stacklevel=2)
-> 1819             return func(ax, *args, **kwargs)
   1820         pre_doc = inner.__doc__
   1821         if pre_doc is None:

/Users/Viola/anaconda/lib/python3.5/site-packages/matplotlib/axes/_axes.py in plot(self, *args, **kwargs)
   1381 
   1382         for line in self._get_lines(*args, **kwargs):
-> 1383             self.add_line(line)
   1384             lines.append(line)
   1385 

/Users/Viola/anaconda/lib/python3.5/site-packages/matplotlib/axes/_base.py in add_line(self, line)
   1701             line.set_clip_path(self.patch)
   1702 
-> 1703         self._update_line_limits(line)
   1704         if not line.get_label():
   1705             line.set_label('_line%d' % len(self.lines))

/Users/Viola/anaconda/lib/python3.5/site-packages/matplotlib/axes/_base.py in _update_line_limits(self, line)
   1723         Figures out the data limit of the given line, updating self.dataLim.
   1724         """
-> 1725         path = line.get_path()
   1726         if path.vertices.size == 0:
   1727             return

/Users/Viola/anaconda/lib/python3.5/site-packages/matplotlib/lines.py in get_path(self)
    936         """
    937         if self._invalidy or self._invalidx:
--> 938             self.recache()
    939         return self._path
    940 

/Users/Viola/anaconda/lib/python3.5/site-packages/matplotlib/lines.py in recache(self, always)
    623                 x = ma.asarray(xconv, np.float_).filled(np.nan)
    624             else:
--> 625                 x = np.asarray(xconv, np.float_)
    626             x = x.ravel()
    627         else:

/Users/Viola/anaconda/lib/python3.5/site-packages/numpy/core/numeric.py in asarray(a, dtype, order)
    529 
    530     """
--> 531     return array(a, dtype, copy=False, order=order)
    532 
    533 

ValueError: could not convert string to float: '2015-12'

In [110]:
m_vals.min()


Out[110]:
-2.7405656413578292

In [109]:
str(np.array([p_value]))[3:-1]


Out[109]:
'.40642759'

In [ ]: