In [1]:
%matplotlib inline

import os
import requests
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import csv
import sys

from pandas.tools.plotting import scatter_matrix

from sklearn import cross_validation as cv
from sklearn.cross_validation import train_test_split as tts
from sklearn.linear_model import Ridge
from sklearn.linear_model import RandomizedLasso
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import Lasso
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error as mse
from sklearn.cross_validation import cross_val_predict
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score
from sklearn.pipeline import FeatureUnion
from sklearn import preprocessing
from sklearn import linear_model
from sklearn.decomposition import PCA
from sklearn.feature_selection import chi2
from sklearn.preprocessing import FunctionTransformer
from scipy import stats
from sklearn.linear_model import ElasticNet


C:\Users\mjsteele\Anaconda3\lib\site-packages\sklearn\cross_validation.py:44: DeprecationWarning: This module was deprecated in version 0.18 in favor of the model_selection module into which all the refactored classes and functions are moved. Also note that the interface of the new CV iterators are different from that of this module. This module will be removed in 0.20.
  "This module will be removed in 0.20.", DeprecationWarning)

In [2]:
#Import Data
data = pd.read_csv('D:\\yelp\\data\\Final_Regression_Data\\Final\\the_final_countdown-1\\the_final_countdown.csv')

In [3]:
data.head()


Out[3]:
_id restaurant_name address_full business_id review_count inspection_date stars latitude longitude violations ... LasVegas neighborhood0 neighborhood1 neighborhood2 neighborhood3 neighborhood4 neighborhood5 neighborhood6 PreviousViolations DiffPreviousTwo
0 ObjectId(5830680bf3f071f6de30b1d0) GRASSHOPPER VEGETARIAN 1 N Beacon ST Allston 02134 MiOurH3MHs6CwA6iOWehOQ 424 8/4/2008 4.0 42.35377 -71.137418 8 ... 0 0 0 0 0 0 0 0 0 0
1 ObjectId(5830680bf3f071f6de30b1d0) GRASSHOPPER VEGETARIAN 1 N Beacon ST Allston 02134 MiOurH3MHs6CwA6iOWehOQ 424 8/18/2008 4.0 42.35377 -71.137418 8 ... 0 0 0 0 0 0 0 0 8 0
2 ObjectId(5830680bf3f071f6de30b1d0) GRASSHOPPER VEGETARIAN 1 N Beacon ST Allston 02134 MiOurH3MHs6CwA6iOWehOQ 424 7/13/2009 4.0 42.35377 -71.137418 4 ... 0 0 0 0 0 0 0 0 8 0
3 ObjectId(5830680bf3f071f6de30b1d0) GRASSHOPPER VEGETARIAN 1 N Beacon ST Allston 02134 MiOurH3MHs6CwA6iOWehOQ 424 7/27/2009 4.0 42.35377 -71.137418 4 ... 0 0 0 0 0 0 0 0 4 0
4 ObjectId(5830680bf3f071f6de30b1d0) GRASSHOPPER VEGETARIAN 1 N Beacon ST Allston 02134 MiOurH3MHs6CwA6iOWehOQ 424 6/3/2010 4.0 42.35377 -71.137418 12 ... 0 0 0 0 0 0 0 0 4 0

5 rows × 68 columns


In [4]:
data.shape


Out[4]:
(34991, 68)

In [5]:
data.describe()


Out[5]:
review_count stars latitude longitude violations ChangeInViolations IsAsian IsFrench IsSandwiches IsFastFood ... LasVegas neighborhood0 neighborhood1 neighborhood2 neighborhood3 neighborhood4 neighborhood5 neighborhood6 PreviousViolations DiffPreviousTwo
count 34991.000000 34991.000000 34991.000000 34991.000000 34991.000000 34991.000000 34991.000000 34991.000000 34991.000000 34991.000000 ... 34991.000000 34991.0 34991.000000 34991.000000 34991.000000 34991.000000 34991.000000 34991.000000 34991.000000 34991.000000
mean 121.214227 3.601626 39.289480 -85.464582 6.934669 1.555171 0.192650 0.008459 0.113629 0.050699 ... 0.290389 0.0 0.581807 0.521363 0.663371 0.551399 0.350804 0.443228 6.326484 -0.045297
std 178.588309 0.700143 3.355512 19.216630 8.340175 5.429591 0.394386 0.091586 0.317365 0.219385 ... 0.453948 0.0 0.493269 0.499551 0.472564 0.497358 0.477229 0.496774 8.282943 3.511681
min 3.000000 1.000000 33.000000 -115.000000 0.000000 -42.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 -98.000000
25% 21.000000 3.000000 36.000000 -115.000000 2.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.0 0.000000 0.000000 0.000000 0.000000 0.000000 0.000000 1.000000 0.000000
50% 60.000000 4.000000 42.292474 -71.137565 5.000000 0.000000 0.000000 0.000000 0.000000 0.000000 ... 0.000000 0.0 1.000000 1.000000 1.000000 1.000000 0.000000 0.000000 5.000000 0.000000
75% 148.000000 4.000000 42.349124 -71.068237 9.000000 1.000000 0.000000 0.000000 0.000000 0.000000 ... 1.000000 0.0 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 8.000000 0.000000
max 1922.000000 5.000000 42.389913 -70.996696 100.000000 100.000000 1.000000 1.000000 1.000000 1.000000 ... 1.000000 0.0 1.000000 1.000000 1.000000 1.000000 1.000000 1.000000 100.000000 100.000000

8 rows × 63 columns


In [6]:
x = data['stars']
y = data['review_count']
plt.ylabel('review_count')
plt.xlabel('stars')
plt.scatter(x,y)
plt.title('Average number of reviews by star ratings')
print('The average number of stars is:', x.mean(), 'reviews:', y.mean())


The average number of stars is: 3.6016261324340544 reviews: 121.21422651538967

In [7]:
x = data['review_count']
x.xlabel= ('review_count')
x.hist(bins=500, range = [0, 1000])


Out[7]:
<matplotlib.axes._subplots.AxesSubplot at 0x23d9851bb38>

In [8]:
x = data['review_count']
x.xlabel= ('review_count')
x.hist(bins=500, range = [0, 500])
plt.title('Distribution of review counts')
print('Mean number of reviews:', x.mean())


Mean number of reviews: 121.21422651538967

In [9]:
x = data['violations']
plt.xlabel= ('violations')
x.hist(bins=50, range = [0, 50])
plt.title('Distribution of violation counts')
print('The mean number of violations is:', x.mean(), 'Standard deviation', x.std())


The mean number of violations is: 6.934668914863822 Standard deviation 8.340175123804492

In [10]:
data.boxplot('violations', by='stars', figsize =(10,8))
plt.ylim(0,25)


Out[10]:
(0, 25)

In [11]:
vhist = data.boxplot(['violations'], by ='LasVegas', figsize=(5,5))
vhist.set_ylim(0,25), 
print('Number of Las Vegas instances:', data['LasVegas'].sum())
vhist2 = data.boxplot(['violations'], by = 'Charlotte', figsize=(5,5))
vhist2.set_ylim(0,25)
print('Number of Charlotte Instances:', data['Charlotte'].sum())
vhist3 = data.boxplot(['violations'], by = 'Boston', figsize=(5,5))
vhist3.set_ylim(0,25)
print('Number of Boston Instances:', data['Boston'].sum())


Number of Las Vegas instances: 10161
Number of Charlotte Instances: 5758
Number of Boston Instances: 19072

In [12]:
sns.pairplot(data, x_vars=['review_count','PreviousViolations', 'DiffPreviousTwo'], y_vars='violations', size=7, aspect=.7)


Out[12]:
<seaborn.axisgrid.PairGrid at 0x23d98995fd0>

In [13]:
#scaling continuous independent variables, with mean zero
sdf = data.copy()
scaled = np.array(sdf[['review_count', 'stars', 'pricerange', 'PreviousViolations', 'DiffPreviousTwo']])
scaled = preprocessing.scale(scaled)
# scaled.shape
final = pd.DataFrame(data = scaled,
                     columns =['review_count', 'stars', 'pricerange', 'PreviousViolations', 'DiffPreviousTwo'])

delist = ['review_count', 'stars', 'pricerange', 'PreviousViolations', 'DiffPreviousTwo']
ndf = data.copy()
for i in delist:
    del ndf[i]
# ndf.shape
df = pd.concat([final, ndf], axis=1)

df.shape


Out[13]:
(34991, 68)

In [14]:
variables = df[['PreviousViolations','DiffPreviousTwo','IsAsian','IsFrench','IsSandwiches',
                 'IsFastFood','IsBurgers','IsItalian','IsHawaiian','IsSouthern','IsMexican','IsLatinAmerican','IsMiddleEastern',
                 'IsGreek','IsAmerican','IsDonuts','IsIndian','IsSeafood','IsDesserts','IsSalad','Pizza','IsBuffets',
                 'IsSushiBars','IsDelis','IsSports Bars','IsBakeries','IsPubs','IsCaterers','IsDiners','IsCafes','IsBars',
                 'alcohol','delivery','dogsallowed','smoking','goodforkids','outdoorseating','waiterservice','creditcards',
                 'pricerange','drivethru','tourist','classy','hipster','latenight','upscale','divey','Boston','Charlotte',
                 'LasVegas','neighborhood0','neighborhood1','neighborhood2','neighborhood3','neighborhood4','neighborhood5','neighborhood6']]
target = df['violations']
variables.shape
print(target.shape, variables.shape)


(34991,) (34991, 57)

In [37]:
#Train test splits
splits = cv.train_test_split(variables, target, test_size=0.083)
X_train, X_test, y_train, y_test = splits

In [16]:
#Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)

print("Linear Regression model")
print("Mean Squared Error: %0.3f" % mse(expected, predicted))
print("Coefficient of Determination: %0.3f" % r2_score(expected, predicted))



#Plot measured vs. predicted values
fig, ax = plt.subplots()
ax.scatter(expected, predicted)
ax.plot([target.min(), target.max()], [target.min(), target.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.title('Linear Regression')
plt.show()


Linear Regression model
Mean Squared Error: 62.750
Coefficient of Determination: 0.072

In [17]:
labels = np.array(['Variable', 'Coefficient'])
coefs1 = pd.DataFrame(list(zip(variables, model.coef_)), columns = labels)
coefs1


Out[17]:
Variable Coefficient
0 PreviousViolations 3.050261e-01
1 DiffPreviousTwo 3.784858e-01
2 IsAsian 1.898771e+00
3 IsFrench 3.478446e-01
4 IsSandwiches 8.128180e-02
5 IsFastFood -7.015571e-01
6 IsBurgers 1.004427e-01
7 IsItalian 8.414192e-02
8 IsHawaiian 7.161296e-01
9 IsSouthern -2.199613e-01
10 IsMexican 5.746720e-01
11 IsLatinAmerican -5.461959e+13
12 IsMiddleEastern -2.461441e-01
13 IsGreek -3.323098e-01
14 IsAmerican -6.660949e-01
15 IsDonuts -1.422086e+00
16 IsIndian 1.042908e+00
17 IsSeafood 7.693909e-01
18 IsDesserts -1.196594e+00
19 IsSalad -1.610575e-01
20 Pizza 2.335371e-01
21 IsBuffets 2.691680e+00
22 IsSushiBars -1.959432e-01
23 IsDelis -1.660592e-01
24 IsSports Bars 1.420476e+00
25 IsBakeries 5.375049e-02
26 IsPubs 4.329219e-01
27 IsCaterers -4.395413e-01
28 IsDiners 1.430168e+00
29 IsCafes -1.185083e+00
30 IsBars 5.882748e-01
31 alcohol 4.651887e-01
32 delivery 3.422767e-01
33 dogsallowed -2.429801e-01
34 smoking -8.972433e-01
35 goodforkids 1.016857e+00
36 outdoorseating -8.034210e-02
37 waiterservice 2.174226e-01
38 creditcards -2.200620e-01
39 pricerange 1.076916e-01
40 drivethru -6.151114e-01
41 tourist -2.532856e+00
42 classy -2.897845e-01
43 hipster -3.210836e-01
44 latenight 1.446938e+00
45 upscale -1.494549e+00
46 divey 1.777851e-01
47 Boston 3.933068e+13
48 Charlotte 3.933068e+13
49 LasVegas 3.933068e+13
50 neighborhood0 6.472870e+00
51 neighborhood1 -3.123024e-01
52 neighborhood2 2.430806e-02
53 neighborhood3 5.805499e-01
54 neighborhood4 4.208388e-02
55 neighborhood5 -2.038323e-01
56 neighborhood6 3.647998e-01

In [18]:
# #Remove outliers that have high violation (> 4 standard deviations)
o = df.copy()
odf = o[((o.violations - o.violations.mean()) / o.violations.std()).abs() < 4]
odf.shape


Out[18]:
(34793, 68)

In [19]:
#Select variables and target for cross validation
variables = odf[['PreviousViolations','DiffPreviousTwo','IsAsian','IsFrench','IsSandwiches',
                 'IsFastFood','IsBurgers','IsItalian','IsHawaiian','IsSouthern','IsMexican','IsLatinAmerican','IsMiddleEastern',
                 'IsGreek','IsAmerican','IsDonuts','IsIndian','IsSeafood','IsDesserts','IsSalad','Pizza','IsBuffets',
                 'IsSushiBars','IsDelis','IsSports Bars','IsBakeries','IsPubs','IsCaterers','IsDiners','IsCafes','IsBars',
                 'alcohol','delivery','dogsallowed','smoking','goodforkids','outdoorseating','waiterservice','creditcards',
                 'pricerange','drivethru','tourist','classy','hipster','latenight','upscale','divey','Boston','Charlotte',
                 'LasVegas','neighborhood0','neighborhood1','neighborhood2','neighborhood3','neighborhood4','neighborhood5','neighborhood6']]
target = odf['violations']
print('variables:', variables.shape, 'target:', target.shape)


variables: (34793, 57) target: (34793,)

In [20]:
#Train test splits
splits = cv.train_test_split(variables, target, test_size=0.83)
X_train, X_test, y_train, y_test = splits

In [21]:
#Linear Regression
model = LinearRegression()
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)

print("Linear Regression model")
print("Mean Squared Error: %0.3f" % mse(expected, predicted))
print("Coefficient of Determination: %0.3f" % r2_score(expected, predicted))



#Plot measured vs. predicted values
fig, ax = plt.subplots()
ax.scatter(expected, predicted)
ax.plot([target.min(), target.max()], [target.min(), target.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.title('Linear Regression Outliers Removed')
plt.show()


Linear Regression model
Mean Squared Error: 31.979
Coefficient of Determination: 0.093

In [22]:
labels = np.array(['Variable', 'Coefficient'])
coefs2 = pd.DataFrame(list(zip(variables, model.coef_)), columns = labels)
coefs2


Out[22]:
Variable Coefficient
0 PreviousViolations 6.870151e-01
1 DiffPreviousTwo 3.592239e-01
2 IsAsian 1.475796e+00
3 IsFrench 1.154118e+00
4 IsSandwiches -2.411967e-01
5 IsFastFood -1.785272e-01
6 IsBurgers -9.674334e-01
7 IsItalian -3.676471e-02
8 IsHawaiian 1.192851e+00
9 IsSouthern 3.197179e-01
10 IsMexican 8.700916e-01
11 IsLatinAmerican -7.975551e+10
12 IsMiddleEastern -9.894662e-02
13 IsGreek 9.014200e-01
14 IsAmerican 1.693683e-01
15 IsDonuts -5.352463e-01
16 IsIndian 9.663190e-01
17 IsSeafood 8.501395e-01
18 IsDesserts -1.576016e+00
19 IsSalad -6.389172e-01
20 Pizza -4.059348e-01
21 IsBuffets 1.027609e+00
22 IsSushiBars -2.056971e-01
23 IsDelis -4.478304e-01
24 IsSports Bars 3.560668e-01
25 IsBakeries 5.485470e-01
26 IsPubs -4.533590e-01
27 IsCaterers -1.115086e+00
28 IsDiners 1.341249e+00
29 IsCafes -2.049409e-01
30 IsBars 4.211068e-01
31 alcohol 3.878248e-01
32 delivery 6.773240e-01
33 dogsallowed -7.704256e-01
34 smoking -7.062395e-01
35 goodforkids 7.559268e-01
36 outdoorseating -2.503752e-01
37 waiterservice 2.823931e-01
38 creditcards -5.601161e-01
39 pricerange 1.233490e-01
40 drivethru -2.685093e-03
41 tourist -1.476798e+00
42 classy -5.595853e-01
43 hipster -8.133409e-01
44 latenight 1.127143e+00
45 upscale 9.198683e-01
46 divey 2.467131e-01
47 Boston -4.449975e+12
48 Charlotte -4.449975e+12
49 LasVegas -4.449975e+12
50 neighborhood0 -1.087952e-02
51 neighborhood1 -2.434073e-01
52 neighborhood2 2.120940e-01
53 neighborhood3 3.043577e-01
54 neighborhood4 2.385643e-01
55 neighborhood5 4.557895e-02
56 neighborhood6 2.531174e-01

In [23]:
#Ridge Regression
model = Ridge(alpha=.1)
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)

print("Ridge Regression model")
print("Mean Squared Error: %0.3f" % mse(expected, predicted))
print("Coefficient of Determination: %0.3f" % r2_score(expected, predicted))



#Plot values
fig, ax = plt.subplots()
ax.scatter(expected, predicted)
ax.plot([target.min(), target.max()], [target.min(), target.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.title('Ridge Regression')
plt.show()


Ridge Regression model
Mean Squared Error: 31.978
Coefficient of Determination: 0.093

In [24]:
labels = np.array(['Variable', 'Coefficient'])
coefs3 = pd.DataFrame(list(zip(variables, model.coef_)), columns = labels)
coefs3


Out[24]:
Variable Coefficient
0 PreviousViolations 0.687409
1 DiffPreviousTwo 0.358654
2 IsAsian 1.476598
3 IsFrench 1.154147
4 IsSandwiches -0.238188
5 IsFastFood -0.177453
6 IsBurgers -0.967543
7 IsItalian -0.038133
8 IsHawaiian 1.193894
9 IsSouthern 0.319172
10 IsMexican 0.872171
11 IsLatinAmerican 0.000000
12 IsMiddleEastern -0.092791
13 IsGreek 0.898788
14 IsAmerican 0.169575
15 IsDonuts -0.536633
16 IsIndian 0.967394
17 IsSeafood 0.848367
18 IsDesserts -1.571790
19 IsSalad -0.639429
20 Pizza -0.405995
21 IsBuffets 1.024416
22 IsSushiBars -0.202489
23 IsDelis -0.445738
24 IsSports Bars 0.354754
25 IsBakeries 0.545827
26 IsPubs -0.452381
27 IsCaterers -1.112549
28 IsDiners 1.340800
29 IsCafes -0.203632
30 IsBars 0.419797
31 alcohol 0.387949
32 delivery 0.676360
33 dogsallowed -0.768679
34 smoking -0.704693
35 goodforkids 0.755999
36 outdoorseating -0.248823
37 waiterservice 0.282309
38 creditcards -0.558056
39 pricerange 0.123094
40 drivethru -0.003315
41 tourist -1.473983
42 classy -0.560113
43 hipster -0.814321
44 latenight 1.125969
45 upscale 0.917370
46 divey 0.248160
47 Boston -1.461837
48 Charlotte 0.217464
49 LasVegas 1.244374
50 neighborhood0 0.000000
51 neighborhood1 -0.242788
52 neighborhood2 0.212127
53 neighborhood3 0.304616
54 neighborhood4 0.237835
55 neighborhood5 0.046535
56 neighborhood6 0.252809

In [26]:
# Investigate alpha level for Ridge Regression Model
n_alphas = 200
alphas = np.logspace(-200, 200, n_alphas)
model = linear_model.RidgeCV(alphas = alphas)
model.fit(X_train, y_train)

expected = y_test
predict = model.predict(X_test)
print ('Alpha chosen:', model.alpha_, 'Score:', model.score(X_test, y_test))



#Plot values
fig, ax = plt.subplots()
ax.scatter(target, predicted)
ax.plot([target.min(), target.max()], [target.min(), target.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.title('Ridge Regression')
plt.show()


Alpha chosen: 10.1163797977 Score: 0.0937992798088

In [27]:
#Lasso Regression
model = Lasso()

model.fit(X_train, y_train)

expected  = y_test
predicted = model.predict(X_test)

# Evaluate fit of the model
print("Lasso Regression model")
print("Mean Squared Error: %0.3f" % mse(expected, predicted))
print("Coefficient of Determination: %0.3f" % r2_score(expected, predicted))


#Plot values
fig, ax = plt.subplots()
ax.scatter(expected, predicted)
ax.plot([target.min(), target.max()], [target.min(), target.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.title('Lasso Regression')
plt.show()


Lasso Regression model
Mean Squared Error: 34.958
Coefficient of Determination: 0.009

In [28]:
labels = np.array(['Variable', 'Coefficient'])
coefs4 = pd.DataFrame(list(zip(variables, model.coef_)), columns = labels)
coefs4


Out[28]:
Variable Coefficient
0 PreviousViolations 0.19228
1 DiffPreviousTwo 0.00000
2 IsAsian 0.00000
3 IsFrench 0.00000
4 IsSandwiches -0.00000
5 IsFastFood -0.00000
6 IsBurgers -0.00000
7 IsItalian -0.00000
8 IsHawaiian 0.00000
9 IsSouthern 0.00000
10 IsMexican 0.00000
11 IsLatinAmerican 0.00000
12 IsMiddleEastern -0.00000
13 IsGreek 0.00000
14 IsAmerican -0.00000
15 IsDonuts -0.00000
16 IsIndian 0.00000
17 IsSeafood 0.00000
18 IsDesserts -0.00000
19 IsSalad -0.00000
20 Pizza -0.00000
21 IsBuffets 0.00000
22 IsSushiBars 0.00000
23 IsDelis -0.00000
24 IsSports Bars 0.00000
25 IsBakeries -0.00000
26 IsPubs -0.00000
27 IsCaterers -0.00000
28 IsDiners 0.00000
29 IsCafes -0.00000
30 IsBars 0.00000
31 alcohol -0.00000
32 delivery 0.00000
33 dogsallowed -0.00000
34 smoking 0.00000
35 goodforkids 0.00000
36 outdoorseating -0.00000
37 waiterservice 0.00000
38 creditcards 0.00000
39 pricerange 0.00000
40 drivethru -0.00000
41 tourist -0.00000
42 classy -0.00000
43 hipster -0.00000
44 latenight 0.00000
45 upscale 0.00000
46 divey 0.00000
47 Boston -0.00000
48 Charlotte 0.00000
49 LasVegas 0.00000
50 neighborhood0 0.00000
51 neighborhood1 0.00000
52 neighborhood2 0.00000
53 neighborhood3 0.00000
54 neighborhood4 0.00000
55 neighborhood5 0.00000
56 neighborhood6 0.00000

In [29]:
model = ElasticNet()
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)

print("Random Forest model")
print("Mean squared error = %0.3f" % mse(expected, predicted))
print("R2 score = %0.3f" % r2_score(expected, predicted))



fig, ax = plt.subplots()
ax.scatter(expected, predicted)
ax.plot([target.min(), target.max()], [target.min(), target.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.title('Elastic Net')
plt.show()


Random Forest model
Mean squared error = 34.489
R2 score = 0.022

In [30]:
labels = np.array(['Variable', 'Coefficient'])
coefs5 = pd.DataFrame(list(zip(variables, model.coef_)), columns = labels)
coefs5


Out[30]:
Variable Coefficient
0 PreviousViolations 0.451774
1 DiffPreviousTwo 0.000000
2 IsAsian 0.000000
3 IsFrench 0.000000
4 IsSandwiches -0.000000
5 IsFastFood -0.000000
6 IsBurgers -0.000000
7 IsItalian -0.000000
8 IsHawaiian 0.000000
9 IsSouthern 0.000000
10 IsMexican 0.000000
11 IsLatinAmerican 0.000000
12 IsMiddleEastern -0.000000
13 IsGreek 0.000000
14 IsAmerican -0.000000
15 IsDonuts -0.000000
16 IsIndian 0.000000
17 IsSeafood 0.000000
18 IsDesserts -0.000000
19 IsSalad -0.000000
20 Pizza -0.000000
21 IsBuffets 0.000000
22 IsSushiBars 0.000000
23 IsDelis -0.000000
24 IsSports Bars 0.000000
25 IsBakeries -0.000000
26 IsPubs -0.000000
27 IsCaterers -0.000000
28 IsDiners 0.000000
29 IsCafes -0.000000
30 IsBars 0.000000
31 alcohol 0.000000
32 delivery 0.000000
33 dogsallowed -0.000000
34 smoking 0.000000
35 goodforkids 0.000000
36 outdoorseating -0.000000
37 waiterservice 0.000000
38 creditcards 0.000000
39 pricerange 0.000000
40 drivethru -0.000000
41 tourist -0.000000
42 classy -0.000000
43 hipster -0.000000
44 latenight 0.000000
45 upscale 0.000000
46 divey 0.000000
47 Boston -0.142898
48 Charlotte 0.000000
49 LasVegas 0.000000
50 neighborhood0 0.000000
51 neighborhood1 0.000000
52 neighborhood2 0.000000
53 neighborhood3 0.000000
54 neighborhood4 0.000000
55 neighborhood5 0.000000
56 neighborhood6 0.000000

In [31]:
model = RandomForestRegressor()
model.fit(X_train, y_train)

expected = y_test
predicted = model.predict(X_test)

print("Random Forest model")
print("Mean squared error = %0.3f" % mse(expected, predicted))
print("R2 score = %0.3f" % r2_score(expected, predicted))



fig, ax = plt.subplots()
ax.scatter(expected, predicted)
ax.plot([target.min(), target.max()], [target.min(), target.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.title('Random Forest')
plt.show()


Random Forest model
Mean squared error = 33.721
R2 score = 0.044

In [38]:
polypipe = Pipeline([('Polynomial', PolynomialFeatures(2)),
                     ('LinearRegression', LinearRegression())])
        
polypipe.fit(X_train, y_train)

expected = y_test
predicted = polypipe.predict(X_test)

print("Linear Regression model")
print("Mean Squared Error: %0.3f" % mse(expected, predicted))
print("Coefficient of Determination: %0.3f" % r2_score(expected, predicted))


fig, ax = plt.subplots()
ax.scatter(expected, predicted)
ax.plot([target.min(), target.max()], [predicted.min(), predicted.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.title ('Polynomial 2nd Regression')
plt.show()


Linear Regression model
Mean Squared Error: 31.256
Coefficient of Determination: 0.173

In [34]:
labels = np.array(['Variable', 'Coefficient'])
coefs6 = pd.DataFrame(list(zip(variables, polypipe.named_steps['LinearRegression'].coef_)), columns = labels)
coefs6


Out[34]:
Variable Coefficient
0 PreviousViolations -3.258138e+07
1 DiffPreviousTwo -1.027484e+10
2 IsAsian 5.656899e+10
3 IsFrench -2.312191e+10
4 IsSandwiches 2.500166e+10
5 IsFastFood -3.879142e+09
6 IsBurgers 3.296400e+10
7 IsItalian 9.019567e+09
8 IsHawaiian -7.549118e+09
9 IsSouthern -4.189203e+09
10 IsMexican 4.796961e+08
11 IsLatinAmerican -5.009411e+08
12 IsMiddleEastern -3.251170e+09
13 IsGreek -3.201290e+09
14 IsAmerican -7.556746e+09
15 IsDonuts 2.591343e+08
16 IsIndian -1.034474e+09
17 IsSeafood -5.615458e+09
18 IsDesserts 2.506461e+09
19 IsSalad -5.703510e+09
20 Pizza -5.052894e+09
21 IsBuffets 1.031796e+09
22 IsSushiBars 8.495870e+08
23 IsDelis 9.834595e+08
24 IsSports Bars 1.550250e+09
25 IsBakeries -2.240768e+09
26 IsPubs 1.356029e+08
27 IsCaterers 4.441474e+09
28 IsDiners -7.809688e+08
29 IsCafes -1.551970e+09
30 IsBars 4.972032e+08
31 alcohol 1.767504e+09
32 delivery -8.025599e+08
33 dogsallowed 4.351489e+08
34 smoking -6.668421e+08
35 goodforkids -3.350801e+08
36 outdoorseating 1.826866e+09
37 waiterservice -2.995990e+08
38 creditcards -3.432583e+09
39 pricerange -5.070179e+07
40 drivethru -3.409175e+09
41 tourist 4.729005e+08
42 classy 6.142718e+08
43 hipster -1.364332e+09
44 latenight 3.392369e+08
45 upscale -1.253697e+09
46 divey 7.129742e+08
47 Boston -1.340641e+09
48 Charlotte 1.114100e+09
49 LasVegas -1.076058e+09
50 neighborhood0 -1.927169e+09
51 neighborhood1 1.105097e+09
52 neighborhood2 -9.065547e+08
53 neighborhood3 -1.342486e+09
54 neighborhood4 1.004575e+09
55 neighborhood5 3.550831e+09
56 neighborhood6 -8.337425e+07

In [36]:
featurepipe = Pipeline([('Dimension Reduction', PCA()),
                        ('Random Forest', RandomForestRegressor())])
            
featurepipe.fit(X_train, y_train)

expected = y_test
predicted = featurepipe.predict(X_test)

print("Random Forest model")
print("Mean Squared Error: %0.3f" % mse(expected, predicted))
print("Coefficient of Determination: %0.3f" % r2_score(expected, predicted))

#Plot values
fig, ax = plt.subplots()
ax.scatter(expected, predicted)
ax.plot([target.min(), target.max()], [predicted.min(), predicted.max()], 'k--', lw=4)
ax.set_xlabel('Measured')
ax.set_ylabel('Predicted')
plt.title('Random Forest PCA')
plt.show()


Random Forest model
Mean Squared Error: 34.151
Coefficient of Determination: 0.031