In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import datetime
from scipy.stats import pearsonr
from sklearn.kernel_ridge import KernelRidge
import seaborn as sns
sns.set(style="whitegrid")
%matplotlib inline

In [2]:
df = pd.read_csv('ttl_daily.csv', names=['date', 'cnt']).ix[1:,:]
# print(df.head())

df['date'] = pd.to_datetime(df['date'])
# df = df.sort_values(by='date')

df['month'] = df.date.apply(lambda x: x.strftime('%Y-%m')) 
# print(df.head())

df = df.groupby(by='month').sum()
# df.head()

In [3]:
m_vals = df['cnt'].values
months = df.index.values

In [4]:
c = pd.read_excel('unemployment_rate.xlsx', sheetname='CPI', header=1)
c.head()


Out[4]:
Month All items Food Food at home Food away from home Energy Gasoline (all types) Electricity Natural gas (piped) All items less food and energy Commodities less food and energy commodities Apparel New vehicles Medical care commodities Services less energy services Shelter Medical care services Education and communication
0 2006-01-01 0.040 0.026 0.024 0.030 0.248 0.274 0.150 0.345 0.021 0.001 -0.010 -0.004 0.038 0.029 0.026 0.041 0.027
1 2006-02-01 0.036 0.028 0.024 0.030 0.201 0.206 0.154 0.267 0.021 0.000 -0.018 -0.004 0.038 0.029 0.026 0.041 0.026
2 2006-03-01 0.034 0.026 0.022 0.031 0.173 0.170 0.149 0.219 0.021 0.003 -0.012 -0.002 0.041 0.028 0.025 0.041 0.026
3 2006-04-01 0.035 0.018 0.009 0.031 0.178 0.215 0.148 0.110 0.023 0.004 -0.002 -0.003 0.043 0.031 0.028 0.041 0.026
4 2006-05-01 0.042 0.019 0.008 0.032 0.236 0.334 0.130 0.100 0.024 0.003 0.000 -0.007 0.043 0.033 0.032 0.041 0.027

In [5]:
c_vals = c["Medical care services"].values.flatten()
c_vals


Out[5]:
array([ 0.041,  0.041,  0.041,  0.041,  0.041,  0.042,  0.04 ,  0.044,
        0.044,  0.042,  0.041,  0.041,  0.049,  0.051,  0.049,  0.05 ,
        0.05 ,  0.05 ,  0.054,  0.055,  0.056,  0.059,  0.058,  0.059,
        0.057,  0.05 ,  0.048,  0.047,  0.047,  0.046,  0.041,  0.039,
        0.038,  0.034,  0.031,  0.03 ,  0.03 ,  0.031,  0.031,  0.033,
        0.032,  0.031,  0.032,  0.032,  0.033,  0.032,  0.035,  0.034,
        0.035,  0.037,  0.038,  0.037,  0.034,  0.035,  0.032,  0.032,
        0.037,  0.036,  0.034,  0.034,  0.03 ,  0.03 ,  0.027,  0.028,
        0.03 ,  0.029,  0.032,  0.033,  0.028,  0.031,  0.035,  0.036,
        0.037,  0.034,  0.035,  0.037,  0.039,  0.043,  0.044,  0.042,
        0.044,  0.039,  0.037,  0.037,  0.036,  0.039,  0.039,  0.034,
        0.029,  0.028,  0.026,  0.031,  0.031,  0.029,  0.026,  0.025,
        0.025,  0.024,  0.024,  0.027,  0.03 ,  0.026,  0.025,  0.019,
        0.017,  0.019,  0.023,  0.024,  0.023,  0.018,  0.019,  0.026,
        0.025,  0.023,  0.023,  0.022,  0.024,  0.03 ,  0.031,  0.029])

In [6]:
def normalize(vals):
    return (vals - np.mean(vals)) / np.std(vals)

In [7]:
m_vals = normalize(m_vals)
c_vals = normalize(c_vals)

In [8]:
len(m_vals), len(c_vals)


Out[8]:
(120, 120)

In [9]:
x = np.arange(len(m_vals))
X = np.arange(len(m_vals)).reshape([-1,1])

def smooth(x, y, nb):
    y_smooth = np.zeros(x.shape[0])
    for i in range(len(x)):
        if i-nb < 0:
            y_smooth[i] = np.mean(y[:i+11])
        elif i+nb+1 > len(y):
            y_smooth[i] = np.mean(y[i-nb:])
        else:
            y_smooth[i] = np.mean(y[i-nb:i+nb+1])
    return y_smooth
            
            
m_smooth_avg = smooth(x, m_vals, 2)
smooth_cpi = smooth(x, c_vals, 1)

In [10]:
plt.figure(figsize=(20, 10))
plt.plot(X, smooth_cpi, c='orange', linewidth=3, alpha=.7, label = 'Smoothed CPI-MedicalCare Rate')

plt.scatter(X, m_vals, s=100, alpha=.5, c='steelblue', label = 'Monthly Crime Incidents')

plt.plot(X, m_smooth_avg, c='skyblue', alpha=.7, linewidth=4, label = 'Smoothed Crime Signal')

plt.xlim(xmin=0, xmax=len(m_vals))
plt.ylim(ymin=-4, ymax=4)
plt.xticks(np.arange(0, 121, 12).tolist(), np.arange(2006, 2017).tolist())
plt.yticks([])

coef, p_value = pearsonr(c_vals, m_vals)

plt.ylabel('Number of Crimes Per Month', fontsize = 20)
plt.xlabel('Time, Graphed by Months', fontsize = 20)
# plt.title('NYC Crime Over Time', fontsize = 30)
plt.title('Coefficient of Correlation Between Unemployment Rate and Crime: ' + str(np.round(coef, 3)) + 
         '\n p value representing percent chance that this occurred by chance: ' + str(np.array([p_value]))[3:-1],
          fontsize=20)
plt.legend(fontsize = 20, loc=0)

plt.show()



In [11]:
m_vals.min()


Out[11]:
-2.7405656413578292

In [12]:
# str(np.array([p_value]))[3:-1]

CPI and Highly Correlated Crime Types


In [13]:
df_kycdMonth = pd.read_csv('kycd_monthly.csv')
df_kycdMonth.columns = ['Month', 'KYCD', 'Count']

In [20]:
df_of = pd.read_csv('kycd_OfnsDesc.csv')
# criminal_kycds = [358, 233]  # Highly correlated to "Medical care commodities"
criminal_kycds = [117, 361, 107, 110, 124] # Highly correlated to "Medical care services"
arr = df_of.sort_values(by='Counts', ascending=False).KY_CD.values

In [21]:
DF = df_of[df_of.KY_CD.isin(criminal_kycds)]
DF


Out[21]:
KY_CD OFNS_DESC Counts
8 107 NaN 1
9 107 BURGLARY 191406
12 110 GRAND LARCENY OF MOTOR VEHICLE 102061
23 117 NaN 4
24 117 DANGEROUS DRUGS 62679
33 124 NaN 1
34 124 KIDNAPPING 2
35 124 KIDNAPPING & RELATED OFFENSES 2300
36 124 KIDNAPPING AND RELATED OFFENSES 2
93 361 NaN 37
94 361 OFF. AGNST PUB ORD SENSBLTY & 283065

In [22]:
dic1 = {}
for code in arr:
    dic1[code] = smooth(x, normalize(df_kycdMonth[df_kycdMonth.KYCD==code].Count.values), 1)


/Users/Viola/anaconda/lib/python3.5/site-packages/numpy/core/fromnumeric.py:2889: RuntimeWarning: Mean of empty slice.
  out=out, **kwargs)
/Users/Viola/anaconda/lib/python3.5/site-packages/numpy/core/_methods.py:80: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
/Users/Viola/anaconda/lib/python3.5/site-packages/numpy/core/_methods.py:135: RuntimeWarning: Degrees of freedom <= 0 for slice
  keepdims=keepdims)
/Users/Viola/anaconda/lib/python3.5/site-packages/numpy/core/_methods.py:105: RuntimeWarning: invalid value encountered in true_divide
  arrmean, rcount, out=arrmean, casting='unsafe', subok=False)
/Users/Viola/anaconda/lib/python3.5/site-packages/numpy/core/_methods.py:127: RuntimeWarning: invalid value encountered in double_scalars
  ret = ret.dtype.type(ret / rcount)
/Users/Viola/anaconda/lib/python3.5/site-packages/ipykernel/__main__.py:2: RuntimeWarning: invalid value encountered in true_divide
  from ipykernel import kernelapp as app

In [23]:
coefs = []
p_values = []

for k, v in dic1.items():
    plt.plot(X, v, alpha=.7, linewidth=5, label = 'Smoothed Signal'+str(k))
    coef, p_value = pearsonr(c_vals, v)
    coefs.append([k, coef])

coefs


Out[23]:
[[230, nan],
 [881, nan],
 [235, -0.04974919411657576],
 [357, nan],
 [577, nan],
 [103, nan],
 [236, -0.073056004478535913],
 [109, -0.27120098815823412],
 [110, 0.6055474995844613],
 [363, nan],
 [237, nan],
 [359, -0.42399137868690029],
 [676, nan],
 [677, nan],
 [678, -0.47198199428778542],
 [113, 0.16348326471846866],
 [102, nan],
 [685, nan],
 [234, nan],
 [571, nan],
 [572, nan],
 [232, 0.066645487156064429],
 [672, nan],
 [578, -0.43942338536962083],
 [455, nan],
 [362, nan],
 [366, nan],
 [119, nan],
 [460, nan],
 [361, 0.63572119547037753],
 [675, 0.22497713078609119],
 [340, -0.68390184789165842],
 [341, -0.429201995463526],
 [342, nan],
 [343, -0.67405560686186783],
 [344, -0.57715099854292662],
 [345, nan],
 [346, nan],
 [347, 0.09563445933548638],
 [348, -0.65951729630284384],
 [349, nan],
 [350, -0.15621261793031396],
 [351, 0.45811477956757679],
 [352, 0.18881044359240992],
 [353, -0.53461452068949122],
 [354, nan],
 [355, -0.3240486347850931],
 [356, nan],
 [101, nan],
 [358, 0.0301059390308119],
 [231, -0.58944729281503383],
 [360, 0.098977888598121608],
 [233, -0.20690711183972632],
 [106, -0.57873046200158051],
 [107, 0.54182997558688095],
 [364, 0.24928002575068831],
 [365, -0.56839498530076682],
 [238, -0.10653336882783737],
 [111, -0.038153398013812782],
 [112, 0.35844321106203375],
 [104, 0.12150824270437226],
 [114, -0.512721440570164],
 [115, nan],
 [116, -0.0826939809536055],
 [117, 0.74939077937255882],
 [118, -0.41628436310932287],
 [105, 0.09230954454652826],
 [120, nan],
 [121, -0.41539942663238011],
 [122, nan],
 [123, nan],
 [124, 0.5855079611325239],
 [125, -0.51310116434400521],
 [126, -0.68841841423846273]]

In [ ]:


In [24]:
dic = {}
for code in criminal_kycds:
    dic[code] = smooth(x, normalize(df_kycdMonth[df_kycdMonth.KYCD==code].Count.values), 1)

In [29]:
plt.figure(figsize=(20, 10))
plt.plot(X, smooth_cpi, c='black', linewidth=17, alpha=.3, label = 'Smoothed CPI Rate')

# plt.scatter(X, m_vals, s=100, alpha=.5, c='steelblue', label = 'Monthly Crime Incidents')
# plt.plot(X, m_smooth_avg, c='skyblue', alpha=.7, linewidth=4, label = 'Smoothed Crime Signal')

for k, v in dic.items():
    if k==117:
        plt.plot(X, v, 'gold', alpha=.8, linewidth=10, label = 'Dangerous Drugs')
    else:
        plt.plot(X, v, alpha=.5, linewidth=5, label = 'Smoothed Signal'+str(k))
    coef, p_value = pearsonr(c_vals, m_vals)
    coefs.append(coef)


plt.xlim(xmin=0, xmax=len(m_vals))
plt.ylim(ymin=-4, ymax=4)
plt.xticks(np.arange(0, 121, 12).tolist(), np.arange(2006, 2017).tolist())
plt.yticks([])

# coef, p_value = pearsonr(c_vals, m_vals)

plt.ylabel('Normalized Values Per Month', fontsize = 20)
plt.xlabel('Time, Graphed by Months', fontsize = 20)
# plt.title('NYC Crime Over Time', fontsize = 30)
plt.title('Coefficient of Correlation Between CPI and Crime: ' + str(np.round(coef, 3)) + 
         '\n p value representing percent chance that this occurred by chance: ' + str(np.array([p_value]))[3:-1],
          fontsize=20)
plt.legend(fontsize = 20, loc=0)

plt.show()


About Statistical correlations of criminal behaviour