In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
import datetime
from scipy.stats import pearsonr
from sklearn.kernel_ridge import KernelRidge
import seaborn as sns
sns.set(style="whitegrid")
%matplotlib inline

In [2]:
df = pd.read_csv('ttl_daily.csv', names=['date', 'cnt']).iloc[1:, :]  # drop the header row that was read in as data
df['cnt'] = df['cnt'].astype(int)  # with the header in row 0, counts were parsed as strings
print(df.head())

df['date'] = pd.to_datetime(df['date'])
# df = df.sort_values(by='date')

df['month'] = df['date'].dt.strftime('%Y-%m')
print(df.head())

df = df.groupby('month')[['cnt']].sum()  # keep only the numeric count column
df.head()


         date   cnt
1  01/01/2006  1485
2  01/01/2007  1636
3  01/01/2008  1631
4  01/01/2009  1547
5  01/01/2010  1757
        date   cnt    month
1 2006-01-01  1485  2006-01
2 2007-01-01  1636  2007-01
3 2008-01-01  1631  2008-01
4 2009-01-01  1547  2009-01
5 2010-01-01  1757  2010-01
Out[2]:
           cnt
month
2006-01  28335
2006-02  24071
2006-03  28543
2006-04  27803
2006-05  30058
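
The same monthly totals can be produced without building a string month key, using resample on the parsed dates. A minimal equivalent sketch (raw is a hypothetical name, and header=0 assumes the first CSV row is a header, as the row drop in In [2] suggests):

In [ ]:
# Alternative to the groupby above: index by the parsed dates and
# resample to month starts ('MS' labels each bucket by its first day).
raw = pd.read_csv('ttl_daily.csv', header=0, names=['date', 'cnt'],
                  parse_dates=['date'])
monthly = raw.set_index('date')['cnt'].resample('MS').sum()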

In [3]:
m_vals = df['cnt'].values
months = df.index.values

In [4]:
ue = pd.read_excel('unemployment_rate.xlsx', sheet_name='unemploy')
ue_vals = ue.iloc[:, 1:].values.flatten()
ue_vals


Out[4]:
array([ 4.8,  4.7,  4.7,  4.7,  4.7,  4.6,  4.6,  4.5,  4.4,  4.3,  4.3,
        4.3,  4.3,  4.3,  4.4,  4.4,  4.5,  4.6,  4.7,  4.7,  4.8,  4.8,
        4.8,  4.9,  4.9,  4.9,  5. ,  5.1,  5.2,  5.3,  5.4,  5.6,  5.8,
        6.1,  6.4,  6.8,  7.2,  7.6,  8. ,  8.2,  8.4,  8.6,  8.7,  8.8,
        8.8,  8.9,  8.9,  8.9,  8.8,  8.8,  8.7,  8.7,  8.6,  8.5,  8.5,
        8.5,  8.5,  8.5,  8.4,  8.4,  8.3,  8.2,  8.1,  8.1,  8.1,  8.2,
        8.2,  8.3,  8.4,  8.4,  8.5,  8.5,  8.6,  8.6,  8.7,  8.7,  8.7,
        8.7,  8.6,  8.4,  8.3,  8.2,  8.2,  8.1,  8. ,  7.9,  7.8,  7.7,
        7.7,  7.6,  7.6,  7.6,  7.5,  7.3,  7.2,  7. ,  6.9,  6.8,  6.7,
        6.6,  6.5,  6.3,  6.2,  6.1,  6. ,  5.9,  5.9,  5.8,  5.7,  5.7,
        5.6,  5.5,  5.4,  5.3,  5.2,  5.1,  5. ,  5. ,  4.9,  4.9])

In [5]:
def normalize(vals):
    return (vals - np.mean(vals)) / np.std(vals)

In [6]:
m_vals = normalize(m_vals)
ue_vals = normalize(ue_vals)

In [7]:
len(m_vals), len(ue_vals)


Out[7]:
(120, 120)
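
As a sanity check, normalize matches scipy.stats.zscore (both divide by the population standard deviation, ddof=0), and z-scoring an already-normalized series is a no-op:

In [ ]:
from scipy.stats import zscore

# Means should be ~0 and standard deviations ~1 after normalize,
# and re-scoring a z-scored series returns it unchanged.
print(np.isclose(m_vals.mean(), 0), np.isclose(m_vals.std(), 1))
print(np.allclose(ue_vals, zscore(ue_vals)))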

In [8]:
x = np.arange(len(m_vals))
X = np.arange(len(m_vals)).reshape([-1,1])

def smooth(x, y, nb):
    """Centered moving average with nb neighbours on each side;
    windows are truncated at the edges of the series."""
    y_smooth = np.zeros(x.shape[0])
    for i in range(len(x)):
        if i - nb < 0:                        # left edge: truncated window
            y_smooth[i] = np.mean(y[:i+nb+1])
        elif i + nb + 1 > len(y):             # right edge: truncated window
            y_smooth[i] = np.mean(y[i-nb:])
        else:                                 # interior: full 2*nb + 1 window
            y_smooth[i] = np.mean(y[i-nb:i+nb+1])
    return y_smooth
            
            
m_smooth_avg = smooth(x, m_vals, 2)
smooth_unemploy = smooth(x, ue_vals, 1)
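
With the truncated edge windows above, smooth is just a centered moving average, so it can be cross-checked against pandas' rolling mean (a sketch; smooth_pd is a hypothetical helper):

In [ ]:
# Centered moving average with partial windows at the series edges;
# window = 2*nb + 1 mirrors smooth(x, y, nb).
def smooth_pd(y, nb):
    return pd.Series(y).rolling(window=2 * nb + 1, center=True,
                                min_periods=1).mean().values

print(np.allclose(smooth_pd(m_vals, 2), m_smooth_avg))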

In [9]:
m_vals.min()


Out[9]:
-2.7405656413578292

Unemployment and Highly Correlated Crime Types


In [11]:
df_kycdMonth = pd.read_csv('kycd_monthly.csv')
df_kycdMonth.columns = ['Month', 'KYCD', 'Count']

In [12]:
df_of = pd.read_csv('kycd_OfnsDesc.csv')
criminal_kycds = [236, 232, 358]  # Best
labels = {236: 'DANGEROUS WEAPONS', 232: 'POSSESSION OF STOLEN PROPERTY', 358: 'OFFENSES INVOLVING FRAUD'}
# criminal_kycds = [361, 118, 359]  # Second Best
# df_of.sort_values(by='Counts', ascending=False).head(30)
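
The '# Best' codes above were presumably chosen by how strongly each crime type tracks unemployment. One plausible way to reproduce such a ranking (a sketch; it assumes each candidate code has a complete 120-month series with non-constant counts):

In [ ]:
# Rank KY_CD codes by the absolute Pearson correlation between their
# normalized monthly counts and the normalized unemployment rate.
scores = {}
for code in df_kycdMonth.KYCD.unique():
    counts = df_kycdMonth[df_kycdMonth.KYCD == code].Count.values
    if len(counts) == len(ue_vals):  # skip codes with missing months
        scores[code] = pearsonr(normalize(counts), ue_vals)[0]
sorted(scores.items(), key=lambda kv: -abs(kv[1]))[:5]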

In [19]:
df_of.sort_values(by='Counts', ascending=False).head(9).KY_CD.values


Out[19]:
array([341, 578, 344, 351, 109, 235, 361, 105, 107])

In [20]:
df_of[df_of.KY_CD.isin([341, 578, 344, 351, 109, 235, 361, 105, 107])]


Out[20]:
     KY_CD                       OFNS_DESC  Counts
4      105                             NaN       2
5      105                         ROBBERY  198772
8      107                             NaN       1
9      107                        BURGLARY  191406
10     109                             NaN       9
11     109                   GRAND LARCENY  429196
49     235                             NaN      11
50     235                 DANGEROUS DRUGS  285790
58     341                             NaN      10
59     341                   PETIT LARCENY  822498
64     344                             NaN      73
65     344    ASSAULT 3 & RELATED OFFENSES  521538
76     351                             NaN      10
77     351  CRIMINAL MISCHIEF & RELATED OF  433358
93     361                             NaN      37
94     361   OFF. AGNST PUB ORD SENSBLTY &  283065
110    578                             NaN      71
111    578                   HARRASSMENT 2  604070

In [13]:
DF = df_of[df_of.KY_CD.isin(criminal_kycds)]
DF


Out[13]:
    KY_CD                      OFNS_DESC  Counts
44    232                            NaN       1
45    232  POSSESSION OF STOLEN PROPERTY   20376
51    236                            NaN       8
52    236              DANGEROUS WEAPONS   73672
87    358                            NaN       2
88    358       OFFENSES INVOLVING FRAUD   17737

In [14]:
dic = {}
for code in criminal_kycds:
    counts = df_kycdMonth[df_kycdMonth.KYCD == code].Count.values
    dic[code] = smooth(x, normalize(counts), 1)  # smoothed, normalized monthly counts per code

In [23]:
plt.figure(figsize=(20, 10))
plt.plot(X, smooth_unemploy, c='black', linewidth=20, alpha=.2, label = 'Smoothed Unemployment Rate')

# plt.scatter(X, m_vals, s=100, alpha=.5, c='steelblue', label = 'Monthly Crime Incidents')
# plt.plot(X, m_smooth_avg, c='skyblue', alpha=.7, linewidth=4, label = 'Smoothed Crime Signal')

for k, v in dic.items():
    plt.plot(X, v, alpha=.7, linewidth=5, label = 'Smoothed ' + labels[k].capitalize() + ' Signal')


plt.xlim(0, len(m_vals))
plt.ylim(-4, 4)
plt.xticks(np.arange(0, 121, 12).tolist(), np.arange(2006, 2017).tolist())
plt.yticks([])

coef, p_value = pearsonr(ue_vals, m_vals)

plt.ylabel('Normalized Values Per Month', fontsize = 20)
plt.xlabel('Time (months)', fontsize = 20)
# plt.title('NYC Crime Over Time', fontsize = 30)
plt.title('Correlation between unemployment rate and crime: r = ' + str(np.round(coef, 3)) +
          '\np-value (probability of a correlation at least this strong arising by chance alone): ' +
          '{:.3g}'.format(p_value),
          fontsize=20)
plt.legend(fontsize = 20, loc=0)

plt.show()
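
To attach a number to each curve, the same Pearson statistic can be computed per crime type (a sketch using the dic built in In [14]). Note that smoothing induces autocorrelation, which inflates these figures, so the title's r on the unsmoothed series is the more conservative one:

In [ ]:
# Correlation of each smoothed crime-type signal with the smoothed
# unemployment rate.
for k, v in dic.items():
    r, p = pearsonr(v, smooth_unemploy)
    print('{:<30s}  r = {:+.3f}  p = {:.2e}'.format(labels[k], r, p))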


About statistical correlations of criminal behaviour