In [300]:
%matplotlib inline
import math
import locale
from locale import atof
import warnings

import numpy as np
import pandas as pd
import scipy
import scipy.stats as stats
import matplotlib.pyplot as plt
import seaborn as sns

import sklearn
from sklearn import linear_model, metrics, preprocessing
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest, RFE, RFECV, f_regression
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import (KFold, StratifiedKFold,
                                     cross_val_score, cross_val_predict)
from sklearn.datasets import make_classification

import statsmodels.formula.api as smf
from statsmodels.sandbox.regression.predstd import wls_prediction_std
from IPython.display import display
import xlrd

In [301]:
# Import the FBI raw data (read_csv already returns a DataFrame,
# so no separate pd.DataFrame() wrapper is needed)

csv_url = ('https://raw.githubusercontent.com/Thinkful-Ed/data-201-resources/'
           'master/New_York_offenses/NEW_YORK-Offenses_Known_to_Law_Enforcement'
           '_by_City_2013%20-%2013tbl8ny.csv')
fbiraw = pd.read_csv(csv_url, delimiter=',', thousands=',', decimal='.')
fbiraw.head()


Out[301]:
Table 8 Unnamed: 1 Unnamed: 2 Unnamed: 3 Unnamed: 4 Unnamed: 5 Unnamed: 6 Unnamed: 7 Unnamed: 8 Unnamed: 9 Unnamed: 10 Unnamed: 11 Unnamed: 12
0 NEW YORK NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
1 Offenses Known to Law Enforcement NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
2 by City, 2013 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
3 City Population Violent\ncrime Murder and\nnonnegligent\nmanslaughter Rape\n(revised\ndefinition)1 Rape\n(legacy\ndefinition)2 Robbery Aggravated\nassault Property\ncrime Burglary Larceny-\ntheft Motor\nvehicle\ntheft Arson3
4 Adams Village 1,861 0 0 NaN 0 0 0 12 2 10 0 0

In [302]:
#Transform the FBI raw data
#Rename the columns using row 3 of the original data set (the real header row)
fbiraw_t1 = fbiraw.rename(columns=fbiraw.iloc[3])

#Drop the first four rows: the three title rows plus the header row just
#promoted to column names; none of them contain data for the regression model
fbiraw_t2 = fbiraw_t1.drop(fbiraw_t1.index[0:4])
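
pandas can also do the header promotion in one step. A sketch, assuming the real header always sits on the fifth line of the file (index 4) and that the footer rows still need dropping afterwards:

In [ ]:
# Sketch: read the same file with the fifth line as the header,
# skipping the title rows entirely; replaces the rename/drop steps above.
fbiraw_alt = pd.read_csv(csv_url, header=4, thousands=',', decimal='.')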

In [303]:
#Delete the column "Rape (revised definition)1" as it contains no data
fbiraw_t2 = fbiraw_t2.drop('Rape\n(revised\ndefinition)1', axis = 1)

In [304]:
#Delete Arson Column as there is insufficient data
# 'The FBI does not publish arson data unless it receives data from either the agency or the state
#  for all 12 months of the calendar year.'
fbiraw_t2 = fbiraw_t2.drop('Arson3', axis = 1)

In [305]:
#Clean the tail of the data set

#Drop the last 3 rows, as they don't contain relevant information for the model
fbiraw_t2 = fbiraw_t2[:-3]

#Rename the columns to simpler names
fbiraw_t2 = fbiraw_t2.rename(columns={
    'Violent\ncrime': 'Violent Crime',
    'Murder and\nnonnegligent\nmanslaughter': 'Murder',
    'Rape\n(legacy\ndefinition)2': 'Rape',
    'Aggravated\nassault': 'Assault',
    'Property\ncrime': 'PropertyCrime',
    'Larceny-\ntheft': 'Larceny & Theft',
    'Motor\nvehicle\ntheft': 'MotorVehicleTheft'})

In [306]:
#Analyse missing information
fbiraw_t2.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 348 entries, 4 to 351
Data columns (total 11 columns):
City                 348 non-null object
Population           348 non-null object
Violent Crime        348 non-null object
Murder               348 non-null object
Rape                 348 non-null object
Robbery              348 non-null object
Assault              348 non-null object
PropertyCrime        348 non-null object
Burglary             348 non-null object
Larceny & Theft      348 non-null object
MotorVehicleTheft    348 non-null object
dtypes: object(11)
memory usage: 17.7+ KB

In [307]:
#Convert every numeric column from object to float
#(atof handles the thousands separators, e.g. '1,861' -> 1861.0)
locale.setlocale(locale.LC_NUMERIC, '')
numeric_cols = [col for col in fbiraw_t2.columns if col != 'City']
for col in numeric_cols:
    fbiraw_t2[col] = fbiraw_t2[col].apply(atof)
fbiraw_t2.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 348 entries, 4 to 351
Data columns (total 11 columns):
City                 348 non-null object
Population           348 non-null float64
Violent Crime        348 non-null float64
Murder               348 non-null float64
Rape                 348 non-null float64
Robbery              348 non-null float64
Assault              348 non-null float64
PropertyCrime        348 non-null float64
Burglary             348 non-null float64
Larceny & Theft      348 non-null float64
MotorVehicleTheft    348 non-null float64
dtypes: float64(10), object(1)
memory usage: 31.3+ KB
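
A locale-free alternative does the same conversion with pandas alone. A sketch, assuming commas are the only thousands separators in these columns:

In [ ]:
# Sketch: strip the thousands separators and coerce to float without
# touching the process-wide locale setting.
for col in numeric_cols:
    fbiraw_t2[col] = pd.to_numeric(fbiraw_t2[col].astype(str).str.replace(',', ''))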

In [308]:
#Reindex the dataframe

fbiraw_t3 = fbiraw_t2.reset_index(drop=True)
fbiraw_t3.head()


Out[308]:
City Population Violent Crime Murder Rape Robbery Assault PropertyCrime Burglary Larceny & Theft MotorVehicleTheft
0 Adams Village 1861.0 0.0 0.0 0.0 0.0 0.0 12.0 2.0 10.0 0.0
1 Addison Town and Village 2577.0 3.0 0.0 0.0 0.0 3.0 24.0 3.0 20.0 1.0
2 Akron Village 2846.0 3.0 0.0 0.0 0.0 3.0 16.0 1.0 15.0 0.0
3 Albany 97956.0 791.0 8.0 30.0 227.0 526.0 4090.0 705.0 3243.0 142.0
4 Albion Village 6388.0 23.0 0.0 3.0 4.0 16.0 223.0 53.0 165.0 5.0

In [309]:
#All remaining columns are candidate features, so no further extraction
#is needed; fbiraw_t3 already holds the reindexed data

In [310]:
#Eliminate outliers: keep only the cities with fewer than 450 property crimes

fbiraw_t3 = fbiraw_t3[fbiraw_t3.PropertyCrime < 450].reset_index(drop=True)

#Describe the dataset
fbiraw_t3.describe()


Out[310]:
Population Violent Crime Murder Rape Robbery Assault PropertyCrime Burglary Larceny & Theft MotorVehicleTheft
count 281.000000 281.000000 281.000000 281.000000 281.000000 281.000000 281.000000 281.000000 281.000000 281.000000
mean 7921.523132 8.231317 0.053381 0.608541 1.825623 5.743772 119.836299 19.039146 98.220641 2.576512
std 7687.278137 12.442147 0.240530 1.249199 3.596852 8.590599 115.075293 19.842286 96.809428 3.619691
min 526.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000
25% 2498.000000 1.000000 0.000000 0.000000 0.000000 1.000000 32.000000 5.000000 23.000000 0.000000
50% 5213.000000 4.000000 0.000000 0.000000 1.000000 3.000000 77.000000 13.000000 62.000000 1.000000
75% 10374.000000 11.000000 0.000000 1.000000 2.000000 7.000000 180.000000 25.000000 150.000000 3.000000
max 37438.000000 132.000000 2.000000 9.000000 34.000000 87.000000 442.000000 107.000000 410.000000 21.000000
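
The 450 cutoff is hand-picked; a quantile-based rule is a less arbitrary alternative. A sketch using Tukey's 1.5*IQR upper fence (the cutoff it produces will generally differ from 450):

In [ ]:
# Sketch: Tukey-style upper fence instead of a fixed threshold.
q1, q3 = fbiraw_t2['PropertyCrime'].quantile([0.25, 0.75])
upper_fence = q3 + 1.5 * (q3 - q1)
fbiraw_fenced = fbiraw_t2[fbiraw_t2.PropertyCrime <= upper_fence].reset_index(drop=True)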

In [311]:
#Print how many datapoints remain and how many were excluded, then sort by
#PropertyCrime to inspect the largest values kept in the dataset
print(len(fbiraw_t3), len(fbiraw_t2) - len(fbiraw_t3))
fbiraw_t3.sort_values('PropertyCrime',ascending=False).head()


281 67
Out[311]:
City Population Violent Crime Murder Rape Robbery Assault PropertyCrime Burglary Larceny & Theft MotorVehicleTheft
17 Bethlehem Town 34243.0 13.0 0.0 0.0 3.0 10.0 442.0 50.0 388.0 4.0
48 Cicero Town 29571.0 12.0 0.0 0.0 2.0 10.0 440.0 45.0 393.0 2.0
262 Ulster Town 12195.0 14.0 0.0 3.0 2.0 9.0 437.0 25.0 410.0 2.0
192 Oneida 11220.0 18.0 0.0 2.0 4.0 12.0 434.0 68.0 357.0 9.0
54 Corning 11087.0 35.0 0.0 3.0 8.0 24.0 416.0 63.0 349.0 4.0

In [312]:
#Plot the relationships between variables
sns.set_style("white")

#Consider only the variables suitable for the model
dfcont = fbiraw_t3

# Scatterplot matrix.
g = sns.PairGrid(dfcont, diag_sharey=False)
g.map_upper(plt.scatter, alpha=.5)

# Fit line summarizing the linear relationship of the two variables.
g.map_lower(sns.regplot, scatter_kws=dict(alpha=0))

# Give information about the univariate distributions of the variables.
g.map_diag(sns.kdeplot, lw=3)
plt.show()


d:\users\borja.gonzalez\appdata\local\programs\python\python36-32\lib\site-packages\matplotlib\axes\_axes.py:545: UserWarning: No labelled objects found. Use label='...' kwarg on individual plots.
  warnings.warn("No labelled objects found. "

In [313]:
#Create the new feature Population2

#fbiraw_t3['Population2'] = fbiraw_t3['Population']*fbiraw_t3['Population']

In [314]:
#Convert Robbery into a binary (0/1) categorical feature

fbiraw_t3.loc[fbiraw_t3['Robbery'] > 0, 'Robbery'] = 1

In [315]:
#Convert Murder into a binary (0/1) categorical feature

fbiraw_t3.loc[fbiraw_t3['Murder'] > 0, 'Murder'] = 1
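
Both conversions can also be written as one expression each. A sketch (idempotent, so safe to run after the .loc versions above):

In [ ]:
# Sketch: boolean comparison then cast, equivalent to the .loc updates.
fbiraw_t3['Robbery'] = (fbiraw_t3['Robbery'] > 0).astype(float)
fbiraw_t3['Murder'] = (fbiraw_t3['Murder'] > 0).astype(float)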

In [316]:
# Initialize the figure with a logarithmic x axis
f, ax = plt.subplots(figsize=(7, 6))
ax.set_xscale("log")


# Define the variables that are going to be plotted
df_long = fbiraw_t3

#Boxplot the variables
ax = sns.boxplot(data=df_long, orient="h", palette="Set2")



In [317]:
#Sanity check: fbiraw_t2 still holds the original, unfiltered values
fbiraw_t2.head()


Out[317]:
City Population Violent Crime Murder Rape Robbery Assault PropertyCrime Burglary Larceny & Theft MotorVehicleTheft
4 Adams Village 1861.0 0.0 0.0 0.0 0.0 0.0 12.0 2.0 10.0 0.0
5 Addison Town and Village 2577.0 3.0 0.0 0.0 0.0 3.0 24.0 3.0 20.0 1.0
6 Akron Village 2846.0 3.0 0.0 0.0 0.0 3.0 16.0 1.0 15.0 0.0
7 Albany 97956.0 791.0 8.0 30.0 227.0 526.0 4090.0 705.0 3243.0 142.0
8 Albion Village 6388.0 23.0 0.0 3.0 4.0 16.0 223.0 53.0 165.0 5.0

In [318]:
#Drop the City identifier to obtain the final feature dataset

fbidata = fbiraw_t3.drop('City',axis=1)

In [319]:
#Standardize all the features to zero mean and unit variance
names = fbidata.columns
df_scaled = pd.DataFrame(preprocessing.scale(fbidata), columns = names)
df_scaled.describe()


Out[319]:
Population Violent Crime Murder Rape Robbery Assault PropertyCrime Burglary Larceny & Theft MotorVehicleTheft
count 2.810000e+02 2.810000e+02 2.810000e+02 2.810000e+02 2.810000e+02 2.810000e+02 2.810000e+02 2.810000e+02 2.810000e+02 2.810000e+02
mean 6.321555e-18 1.896466e-17 -1.896466e-17 -5.689399e-17 -1.390742e-16 -1.264311e-17 3.160777e-17 5.057244e-17 2.212544e-17 -4.425088e-17
std 1.001784e+00 1.001784e+00 1.001784e+00 1.001784e+00 1.001784e+00 1.001784e+00 1.001784e+00 1.001784e+00 1.001784e+00 1.001784e+00
min -9.637634e-01 -6.627476e-01 -2.289857e-01 -4.880140e-01 -1.010734e+00 -6.698042e-01 -1.043231e+00 -9.612357e-01 -1.016387e+00 -7.130745e-01
25% -7.067780e-01 -5.822322e-01 -2.289857e-01 -4.880140e-01 -1.010734e+00 -5.531902e-01 -7.646560e-01 -7.087990e-01 -7.783833e-01 -7.130745e-01
50% -3.529670e-01 -3.406861e-01 -2.289857e-01 -4.880140e-01 9.893802e-01 -3.199622e-01 -3.729100e-01 -3.049004e-01 -3.748112e-01 -4.363149e-01
75% 3.195998e-01 2.229216e-01 -2.289857e-01 3.139271e-01 9.893802e-01 1.464937e-01 5.237531e-01 3.009476e-01 5.358129e-01 1.172042e-01
max 3.846503e+00 9.965282e+00 4.367085e+00 6.729456e+00 9.893802e-01 9.475613e+00 2.804585e+00 4.440909e+00 3.226293e+00 5.098877e+00

In [320]:
#Save the cleaned dataset to disk
fbidata.to_csv('fbidata.csv')

Linear regression model


In [321]:
# Instantiate and fit our model.
regr = linear_model.LinearRegression()

#Create the model variables by dropping the unused columns from the scaled dataset
Y = df_scaled['PropertyCrime'].values.ravel()
X = df_scaled.drop(['Violent Crime','Murder','Larceny & Theft','PropertyCrime', 'Rape','MotorVehicleTheft','Assault'],axis=1)
regr.fit(X, Y)

# Inspect the results.
print('\nCoefficients: \n', regr.coef_)
print('\nIntercept: \n', regr.intercept_)
print('\nR-squared:')
print(regr.score(X, Y))
print('\nVariables in the model: \n',list(X.columns))


Coefficients: 
 [ 0.28185986  0.1676387   0.6022175 ]

Intercept: 
 2.26845917801e-17

R-squared:
0.812488657335

Variables in the model: 
 ['Population', 'Robbery', 'Burglary']

In [322]:
# Initialize the figure
f, ax = plt.subplots(figsize=(7, 6))


# Define the variables that are going to be plotted
df_long = X

#Boxplot the variables
ax = sns.boxplot(data=df_long, orient="h", palette="Set2")


Assumption 1: Linear relationship


In [323]:
# Establish the pair outcome-feature we want to compare
outcome = Y
feature = X['Burglary']

# Plot the data as-is
plt.scatter(feature, outcome)
plt.title('Relationship between Outcome & Feature')
plt.xlabel('Burglary (scaled)')
plt.ylabel('PropertyCrime (scaled)')
plt.show()


Assumption 2: Multivariate normality

In [324]:
# Extract predicted values.
predicted = regr.predict(X).ravel()
actual = df_scaled['PropertyCrime']

# Calculate the error, also called the residual.
residual = actual - predicted

# Plot the distribution of errors.
plt.hist(residual)
plt.title('Residual counts')
plt.xlabel('Residual')
plt.ylabel('Count')
plt.show()
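
A histogram gives a rough read; a Q-Q plot against a theoretical normal is a sharper check. A sketch using scipy.stats, imported above:

In [ ]:
# Sketch: ordered residuals against theoretical normal quantiles;
# points hugging the line support the normality assumption.
stats.probplot(residual, dist='norm', plot=plt)
plt.title('Q-Q plot of residuals')
plt.show()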


Assumption 3: Homoscedasticity


In [325]:
plt.scatter(predicted, residual)
plt.xlabel('Predicted')
plt.ylabel('Residual')
plt.axhline(y=0)
plt.title('Residual vs. Predicted')
plt.show()
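
The scatter gives a visual read; the Breusch-Pagan test puts a p-value on it. A sketch, assuming a reasonably recent statsmodels and the X and residual defined above (a small p-value would indicate heteroscedasticity):

In [ ]:
# Sketch: Breusch-Pagan test of residual variance against the regressors.
import statsmodels.api as sm
import statsmodels.stats.api as sms
bp_stat, bp_pvalue, _, _ = sms.het_breuschpagan(residual, sm.add_constant(X))
print('Breusch-Pagan p-value:', bp_pvalue)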


Assumption 4: Low multicollinearity


In [326]:
# Build up the correlation matrix
correlation_matrix = X.corr()
display(correlation_matrix)


Population Robbery Burglary
Population 1.000000 0.432624 0.568601
Robbery 0.432624 1.000000 0.536684
Burglary 0.568601 0.536684 1.000000
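
Pairwise correlations below roughly 0.6 already look manageable; variance inflation factors summarize the joint picture. A sketch (VIFs under about 5 are usually considered unproblematic):

In [ ]:
# Sketch: VIF per feature, computed on the regressors plus a constant.
import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
exog = sm.add_constant(X)
for i, name in enumerate(exog.columns):
    if name != 'const':
        print(name, variance_inflation_factor(exog.values, i))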

PCA & RFE


In [327]:
#Eigenvectors & eigenvalues of the correlation matrix

eig_vals, eig_vecs = np.linalg.eig(correlation_matrix)

# Inspecting the eigenvalues and eigenvectors.
for i in range(len(eig_vals)):
    eigvecs = eig_vecs[:, i].reshape(1, len(X.columns)).T
    print('Eigenvector {}: \n{}'.format(i + 1, eigvecs))
    print('Eigenvalue {}: {}'.format(i + 1, eig_vals[i]))
    print(40 * '-')


sklearn_pca = PCA(n_components=len(X.columns))
Y_sklearn = sklearn_pca.fit_transform(correlation_matrix)

print(
    'The percentage of total variance in the dataset explained by each',
    'component from Sklearn PCA.\n',
    sklearn_pca.explained_variance_ratio_
)


Eigenvector 1: 
[[-0.56929822]
 [-0.55606052]
 [-0.60555449]]
Eigenvalue 1: 2.027376437109163
----------------------------------------
Eigenvector 2: 
[[-0.65096928]
 [ 0.75476172]
 [-0.08107866]]
Eigenvalue 2: 0.5692173321158762
----------------------------------------
Eigenvector 3: 
[[-0.50213399]
 [-0.34803943]
 [ 0.79166281]]
Eigenvalue 3: 0.40340623077496046
----------------------------------------
The percentage of total variance in the dataset explained by each component from Sklearn PCA.
 [  6.60047913e-01   3.39952087e-01   2.77469582e-32]
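
Note that fit_transform above is applied to the 3x3 correlation matrix itself; PCA is more conventionally fit on the standardized observations, in which case explained_variance_ratio_ should line up with the eigenvalues above divided by their sum. A sketch:

In [ ]:
# Sketch: fit PCA on the scaled feature rows rather than the correlation
# matrix; the ratios should match eig_vals / eig_vals.sum() from above.
pca_obs = PCA(n_components=len(X.columns))
components = pca_obs.fit_transform(X)
print(pca_obs.explained_variance_ratio_)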

In [328]:
#Scree plot of the eigenvalues

plt.plot(eig_vals)
plt.show()



In [329]:
# Create the RFE model and select the features; with n_features_to_select
# equal to the full column count nothing is eliminated, so this confirms
# all three features rather than ranking them (see the sketch below)

nfeatures = (len(X.columns))
rfe = RFE(regr, n_features_to_select=nfeatures)
fit = rfe.fit(X,Y)

# summarize the selection of the features

result_RFE = pd.DataFrame(list(zip(X.columns, rfe.ranking_, rfe.support_)),columns=['Features','Ranking','Support'] )
result_RFE.sort_values('Ranking')


Out[329]:
Features Ranking Support
0 Population 1 True
1 Robbery 1 True
2 Burglary 1 True

In [330]:
#Collect the features flagged for removal by RFE (empty here: nothing to drop)
result= result_RFE[result_RFE.Support == False]
drop = result["Features"].tolist()
drop


Out[330]:
[]
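
To obtain an actual ranking, ask RFE to keep a single feature so it must eliminate the rest one by one. A sketch:

In [ ]:
# Sketch: force RFE to rank the features by keeping only one.
rfe_rank = RFE(regr, n_features_to_select=1).fit(X, Y)
print(dict(zip(X.columns, rfe_rank.ranking_)))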

Cross Validation & Predictive Power of the model


In [ ]:
#Initiate the cross-validation generator, n_splits = 10
kf = KFold(10)
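
The plain KFold(10) takes contiguous slices in stored order; since the rows are ordered alphabetically by city, shuffling first is a reasonable precaution. A sketch (the scores below were produced with the unshuffled kf; the seed is arbitrary):

In [ ]:
# Sketch: shuffled folds with a fixed seed for reproducibility.
kf_shuffled = KFold(n_splits=10, shuffle=True, random_state=42)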

In [331]:
#Cross validate the model on the folds

scores = cross_val_score(regr, X, Y, cv=kf)
print('Cross-validated scores:', scores)
print('Cross-validation average:', scores.mean())


Cross-validated scores: [ 0.91582688  0.8232567   0.83997206  0.74336812  0.62213282  0.75082512
  0.84946267  0.82764719  0.90397765  0.6330684 ]
Cross-validation average: 0.790953761621

In [332]:
#Plot predictions vs. real values
predictions = cross_val_predict(regr, X, Y, cv=kf)
plt.scatter(Y, predictions)
plt.plot([Y.min(), Y.max()], [Y.min(), Y.max()])  # y = x reference line
plt.xlabel('Real Values')
plt.ylabel('Predicted Values')
plt.title('Real vs. Predicted')
plt.show()



In [333]:
#Predictive accuracy: R-squared of the out-of-fold predictions
accuracy = metrics.r2_score(Y, predictions)
print ('Cross-Predicted Accuracy:', accuracy)


Cross-Predicted Accuracy: 0.805132585817

Validating a linear regression


In [366]:
#Testing the results with statsmodel
linear_formula = 'PropertyCrime ~ Population+Robbery+Burglary'
lm = smf.ols(formula=linear_formula, data=df_scaled).fit()
print('Coefficients: \n', lm.params)
print('\n P values: \n', lm.pvalues)
print('\n R squared: \n', lm.rsquared)
print('\n Confidence Interval: \n', lm.conf_int())


Coefficients: 
 Intercept     3.122502e-17
Population    2.818599e-01
Robbery       1.676387e-01
Burglary      6.022175e-01
dtype: float64

 P values: 
 Intercept     1.000000e+00
Population    1.984592e-16
Robbery       1.898919e-07
Burglary      9.342222e-47
dtype: float64

 R squared: 
 0.812488657335

 Confidence Interval: 
                    0         1
Intercept  -0.051218  0.051218
Population  0.218520  0.345200
Robbery     0.105888  0.229389
Burglary    0.534539  0.669896

In [367]:
# Use wls_prediction_std to build 95% prediction intervals
prstd, iv_l, iv_u = wls_prediction_std(lm)

plt.figure()
plt.plot(iv_u[0:15], 'o', color='r')
plt.plot(iv_l[0:15], 'o', color='r')
plt.plot(lm.fittedvalues[0:15], 'o', color='b')
plt.title('blue: predicted, red: 95% prediction interval')
plt.show()


Saving the model


In [368]:
import pickle
# Save the fitted model to disk (the context manager closes the file handle)
filename = 'finalized_regr.sav'
with open(filename, 'wb') as f:
    pickle.dump(regr, f)
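
To reuse the model later, load it back and check that it reproduces the fit. A sketch:

In [ ]:
# Sketch: reload the pickled model and confirm the in-sample R-squared.
with open(filename, 'rb') as f:
    loaded_regr = pickle.load(f)
print(loaded_regr.score(X, Y))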

In [ ]: