In [2458]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import matplotlib.pyplot as plt
import seaborn as sns
import math
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA 
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
import scipy.stats as stats
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score, KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import RFECV
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.model_selection import cross_val_predict
from sklearn import metrics
from sklearn.decomposition import PCA as sklearn_pca
import locale
from locale import atof
import warnings
from IPython.display import display
from sklearn import linear_model

In [2459]:
# Import FBI Raw Data

fbidata = pd.read_csv('https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/master/New_York_offenses/NEW_YORK-Offenses_Known_to_Law_Enforcement_by_City_2013%20-%2013tbl8ny.csv', delimiter=",", thousands=',',decimal=".")
fbiraw = pd.DataFrame(fbidata)
fbiraw.head()


Out[2459]:
Table 8 Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10 Unnamed: 11 Unnamed: 12
0 NEW YORK NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 Offenses Known to Law Enforcement NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 by City, 2013 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 City Population Violent\ncrime Murder and\nnonnegligent\nmanslaughter Rape\n(revised\ndefinition)1 Rape\n(legacy\ndefinition)2 Robbery Aggravated\nassault Property\ncrime Burglary Larceny-\ntheft Motor\nvehicle\ntheft Arson3
4 Adams Village 1,861 0 0 NaN 0 0 0 12 2 10 0 0

In [2460]:
#Transform FBI Raw Data
#Rename columns using row 3 (the header row) of the original data set
fbiraw_t1 = fbiraw.rename(columns=fbiraw.iloc[3])

#Delete the first four rows (titles and the header row) as they don't contain data for the regression model
fbiraw_t2 = fbiraw_t1.drop(fbiraw_t1.index[0:4])

In [2461]:
#Delete column "Rape (revised definition)1 as it contains no data
fbiraw_t2 = fbiraw_t2.drop('Rape\n(revised\ndefinition)1', axis = 1)

In [2462]:
#Delete Arson Column as there is insufficient data
# 'The FBI does not publish arson data unless it receives data from either the agency or the state
#  for all 12 months of the calendar year.'
fbiraw_t2 = fbiraw_t2.drop('Arson3', axis = 1)

In [2463]:
#Clean tail from the data set

#Re-shape the dataset by excluding the last 3 rows, as they don't contain relevant information for the model
fbiraw_t2 = fbiraw_t2[:-3]

#Change names in Columns
fbiraw_t2= fbiraw_t2.rename(columns={'Violent\ncrime': 'Violent Crime', 'Murder and\nnonnegligent\nmanslaughter': 'Murder','Rape\n(legacy\ndefinition)2': 'Rape', 'Robbery': 'Robbery', 'Aggravated\nassault': 'Assault', 'Property\ncrime': 'PropertyCrime', 'Burglary': 'Burglary', 'Larceny-\ntheft': 'Larceny & Theft', 'Motor\nvehicle\ntheft': 'Motor Vehicle Theft'})

In [2464]:
#Analyse missing information
fbiraw_t2.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 348 entries, 4 to 351
Data columns (total 11 columns):
City                   348 non-null object
Population             348 non-null object
Violent Crime          348 non-null object
Murder                 348 non-null object
Rape                   348 non-null object
Robbery                348 non-null object
Assault                348 non-null object
PropertyCrime          348 non-null object
Burglary               348 non-null object
Larceny & Theft        348 non-null object
Motor Vehicle Theft    348 non-null object
dtypes: object(11)
memory usage: 17.7+ KB

In [2465]:
#Change all columns except City from object to float
locale.setlocale(locale.LC_NUMERIC, '')
fbiraw_t2['Population'] = fbiraw_t2['Population'].apply(atof)
fbiraw_t2['Violent Crime'] = fbiraw_t2['Violent Crime'].apply(atof)
fbiraw_t2['Murder'] = fbiraw_t2['Murder'].apply(atof)
fbiraw_t2['Rape'] = fbiraw_t2['Rape'].apply(atof)
fbiraw_t2['Robbery'] = fbiraw_t2['Robbery'].apply(atof)
fbiraw_t2['Assault'] = fbiraw_t2['Assault'].apply(atof)
fbiraw_t2['PropertyCrime'] = fbiraw_t2['PropertyCrime'].apply(atof)
fbiraw_t2['Burglary'] = fbiraw_t2['Burglary'].apply(atof)
fbiraw_t2['Larceny & Theft'] = fbiraw_t2['Larceny & Theft'].apply(atof)
fbiraw_t2['Motor Vehicle Theft'] = fbiraw_t2['Motor Vehicle Theft'].apply(atof)
fbiraw_t2.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 348 entries, 4 to 351
Data columns (total 11 columns):
City                   348 non-null object
Population             348 non-null float64
Violent Crime          348 non-null float64
Murder                 348 non-null float64
Rape                   348 non-null float64
Robbery                348 non-null float64
Assault                348 non-null float64
PropertyCrime          348 non-null float64
Burglary               348 non-null float64
Larceny & Theft        348 non-null float64
Motor Vehicle Theft    348 non-null float64
dtypes: float64(10), object(1)
memory usage: 31.3+ KB
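
The same conversion can be written more compactly; a minimal sketch of an equivalent alternative (assuming every non-City column uses ',' as the thousands separator):

In [ ]:
#Hedged sketch: strip thousands separators and convert all numeric columns in one pass
num_cols = [col for col in fbiraw_t2.columns if col != 'City']
fbiraw_t2[num_cols] = (fbiraw_t2[num_cols]
                       .replace(',', '', regex=True)
                       .apply(pd.to_numeric))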

In [2466]:
#Reindex the dataframe

fbiraw_t3 = fbiraw_t2.reset_index(drop=True)

In [2467]:
#Extract only the columns that are needed

fbiraw_t3 = fbiraw_t3[['City','PropertyCrime','Population','Murder','Robbery']]

In [2468]:
#Eliminate outliers

fbiraw_t3 = fbiraw_t3[fbiraw_t3.PropertyCrime < 170].reset_index(drop=True)

#Describe the dataset
fbiraw_t3.describe()


Out[2468]:
PropertyCrime Population Murder Robbery
count 207.000000 207.000000 207.000000 207.000000
mean 60.338164 5212.091787 0.038647 0.618357
std 46.944624 4519.548980 0.193220 1.129589
min 0.000000 526.000000 0.000000 0.000000
25% 20.500000 2059.500000 0.000000 0.000000
50% 50.000000 3633.000000 0.000000 0.000000
75% 93.500000 7149.500000 0.000000 1.000000
max 168.000000 29315.000000 1.000000 7.000000
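
The cutoff of 170 above is hard-coded. A data-driven alternative is to derive a threshold from the interquartile range of the unfiltered data; a minimal sketch (the 1.5*IQR rule is an assumption of this sketch, not part of the original analysis):

In [ ]:
#Hedged sketch: derive an outlier cutoff from the unfiltered PropertyCrime column
q1, q3 = fbiraw_t2['PropertyCrime'].quantile([0.25, 0.75])
iqr_cutoff = q3 + 1.5 * (q3 - q1)
print(iqr_cutoff)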

In [2469]:
#Print the length of the filtered dataset and the number of excluded rows, then sort by Population to inspect the largest remaining cities
print(len(fbiraw_t3), len(fbiraw_t2) - len(fbiraw_t3))
fbiraw_t3.sort_values('Population',ascending=False).head()


207 141
Out[2469]:
City PropertyCrime Population Murder Robbery
58 East Fishkill Town 165.0 29315.0 1.0 1.0
76 Glen Cove 62.0 27134.0 0.0 1.0
57 Eastchester Town 168.0 19800.0 0.0 0.0
112 Lynbrook Village 164.0 19528.0 0.0 3.0
132 New Castle Town 77.0 17864.0 0.0 0.0

In [2470]:
#Plot the relationships between variables
sns.set_style("white")

#Consider only the variables suitable for the model
dfcont = fbiraw_t3[['PropertyCrime','Population','Murder','Robbery']]

# Scatterplot matrix.
g = sns.PairGrid(dfcont, diag_sharey=False)
g.map_upper(plt.scatter, alpha=.5)

# Fit line summarizing the linear relationship of the two variables.
g.map_lower(sns.regplot, scatter_kws=dict(alpha=0))

# Give information about the univariate distributions of the variables.
g.map_diag(sns.kdeplot, lw=3)
plt.show()



In [2471]:
# Initialize the figure with a logarithmic x axis
f, ax = plt.subplots(figsize=(7, 6))
ax.set_xscale("log")

# Define the variables that are going to be plotted
df_long = fbiraw_t3[['PropertyCrime', 'Population']]

#Boxplot of the variables
ax = sns.boxplot(data=df_long, orient="h", palette="Set2")



In [2472]:
#Create the new feature Population2

fbiraw_t3['Population2'] = fbiraw_t3['Population']*fbiraw_t3['Population']
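
The squared term can also be generated with sklearn's PolynomialFeatures (sklearn.preprocessing is already imported); a minimal sketch, illustrative only:

In [ ]:
#Hedged sketch: generate Population and Population squared with PolynomialFeatures
poly = preprocessing.PolynomialFeatures(degree=2, include_bias=False)
pop_terms = poly.fit_transform(fbiraw_t3[['Population']])
#Column 0 is Population, column 1 is Population squared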

In [2473]:
#Convert Robbery into a binary (0/1) feature

fbiraw_t3.loc[fbiraw_t3['Robbery'] > 0, 'Robbery'] = 1

In [2474]:
#Convert Murder into a binary (0/1) feature

fbiraw_t3.loc[fbiraw_t3['Murder'] > 0, 'Murder'] = 1

In [2475]:
#Transform dataset into final dataset with features

fbidata = fbiraw_t3[['PropertyCrime','Population', 'Population2','Murder','Robbery']]
fbidata.head()


Out[2475]:
PropertyCrime Population Population2 Murder Robbery
0 12.0 1861.0 3463321.0 0.0 0.0
1 24.0 2577.0 6640929.0 0.0 0.0
2 16.0 2846.0 8099716.0 0.0 0.0
3 46.0 4089.0 16719921.0 0.0 1.0
4 10.0 1781.0 3171961.0 0.0 0.0

Assumptions of linear regression


In [2476]:
# Instantiate and fit our model.
regr = linear_model.LinearRegression()
Y = fbidata['PropertyCrime'].values.reshape(-1, 1)
X = fbidata[['Population', 'Population2','Murder','Robbery']]
regr.fit(X, Y)

# Inspect the results.
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:')
print(regr.score(X, Y))


Coefficients: 
 [[  1.01345878e-02  -2.77994883e-07   3.82089851e+01   3.24297112e+01]]

Intercept: 
 [ 7.80548467]

R-squared:
0.561885218122
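
The R-squared above is computed on the training data itself. Since cross_val_score is already imported, a cross-validated estimate can complement it; a minimal sketch (the 5-fold split is an arbitrary choice, not part of the original analysis):

In [ ]:
#Hedged sketch: cross-validated R-squared for the same model
scores = cross_val_score(linear_model.LinearRegression(), X, Y.ravel(), cv=5)
print(scores)
print(scores.mean())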

Assumption 1: Linear relationship


In [2477]:
# Check the relationship between the main predictor and the outcome.
outcome = fbidata['PropertyCrime']
feature = fbidata['Population']

# Plot the data as-is. Looks a mite quadratic.
plt.scatter(feature, outcome)
plt.xlabel('Population')
plt.ylabel('PropertyCrime')
plt.title('Raw values')
plt.show()
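
To see how well a quadratic shape describes the relationship, a simple polynomial fit can be overlaid on the scatter; a minimal sketch using numpy's polyfit (illustrative only, separate from the regression model above):

In [ ]:
#Hedged sketch: overlay a degree-2 polynomial fit of PropertyCrime on Population
coeffs = np.polyfit(feature, outcome, 2)
x_line = np.linspace(feature.min(), feature.max(), 100)
plt.scatter(feature, outcome, alpha=.5)
plt.plot(x_line, np.polyval(coeffs, x_line), color='red')
plt.xlabel('Population')
plt.ylabel('PropertyCrime')
plt.title('Quadratic fit (illustrative)')
plt.show()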


Assumption 2: Multivariate normality

In [2478]:
# Extract predicted values.
predicted = regr.predict(X).ravel()
actual = fbidata['PropertyCrime']

# Calculate the error, also called the residual.
residual = actual - predicted

# This looks a bit concerning.
plt.hist(residual)
plt.title('Residual counts')
plt.xlabel('Residual')
plt.ylabel('Count')
plt.show()
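
A Q-Q plot gives another view of whether the residuals are roughly normal; a minimal sketch with scipy.stats.probplot (scipy.stats is already imported as stats):

In [ ]:
#Hedged sketch: Q-Q plot of the residuals against a normal distribution
stats.probplot(residual, plot=plt)
plt.show()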


Assumption 3: Homoscedasticity


In [2479]:
plt.scatter(predicted, residual)
plt.xlabel('Predicted')
plt.ylabel('Residual')
plt.axhline(y=0)
plt.title('Residual vs. Predicted')
plt.show()
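
A formal check can complement the plot: split the residuals at the median predicted value and compare the variances of the two groups; a minimal sketch using Bartlett's test from scipy.stats (the median split is an assumption of this sketch):

In [ ]:
#Hedged sketch: Bartlett's test on residuals split at the median predicted value
low_half = residual[predicted < np.median(predicted)]
high_half = residual[predicted >= np.median(predicted)]
print(stats.bartlett(low_half, high_half))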


Assumption 4: Low multicollinearity


In [2480]:
correlation_matrix = X.corr()
display(correlation_matrix)


Population Population2 Murder Robbery
Population 1.000000 0.915208 0.247698 0.351146
Population2 0.915208 1.000000 0.248068 0.265010
Murder 0.247698 0.248068 1.000000 0.219195
Robbery 0.351146 0.265010 0.219195 1.000000
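
Population and Population2 are, unsurprisingly, strongly correlated (0.92). Variance inflation factors make the impact on the model explicit; a minimal sketch computed with sklearn only, using VIF_j = 1 / (1 - R²_j), where R²_j comes from regressing feature j on the remaining features:

In [ ]:
#Hedged sketch: variance inflation factor for each feature
for col in X.columns:
    others = X.drop(col, axis=1)
    r2 = linear_model.LinearRegression().fit(others, X[col]).score(others, X[col])
    print(col, 1.0 / (1.0 - r2))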