Initialise libraries



In [37]:

    
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pa
from sklearn import linear_model

Load house value vs. crime rate data

The dataset is from the Philadelphia, PA area and includes the average house sale price in a number of neighborhoods. The attributes we have for each neighborhood include the crime rate ('CrimeRate'), miles from Center City ('MilesPhila'), town name ('Name'), and county name ('County').



In [20]:

    
# Root directory of the regression course material. Set the REGRESSION_DIR
# environment variable to run this notebook on another machine; when it is
# unset, the original author's local path is used.
regressionDir = os.environ.get(
    'REGRESSION_DIR',
    '/home/weenkus/workspace/Machine Learning - University of Washington/Regression')

# Build the CSV path with os.path.join instead of string concatenation so
# separators are handled consistently.
sales = pa.read_csv(
    os.path.join(regressionDir, 'datasets', 'Philadelphia_Crime_Rate_noNA.csv'))
sales









    Out[20]:






  
    
      
      HousePrice
      HsPrc ($10,000)
      CrimeRate
      MilesPhila
      PopChg
      Name
      County
    
  
  
    
      0
      140463
      14.0463
      29.7
      10
      -1.0
      Abington
      Montgome
    
    
      1
      113033
      11.3033
      24.1
      18
      4.0
      Ambler
      Montgome
    
    
      2
      124186
      12.4186
      19.5
      25
      8.0
      Aston
      Delaware
    
    
      3
      110490
      11.0490
      49.4
      25
      2.7
      Bensalem
      Bucks
    
    
      4
      79124
      7.9124
      54.1
      19
      3.9
      Bristol B.
      Bucks
    
    
      5
      92634
      9.2634
      48.6
      20
      0.6
      Bristol T.
      Bucks
    
    
      6
      89246
      8.9246
      30.8
      15
      -2.6
      Brookhaven
      Delaware
    
    
      7
      195145
      19.5145
      10.8
      20
      -3.5
      Bryn Athyn
      Montgome
    
    
      8
      297342
      29.7342
      20.2
      14
      0.6
      Bryn Mawr
      Montgome
    
    
      9
      264298
      26.4298
      20.4
      26
      6.0
      Buckingham
      Bucks
    
    
      10
      134342
      13.4342
      17.3
      31
      4.2
      Chalfont
      Bucks
    
    
      11
      147600
      14.7600
      50.3
      9
      -1.0
      Cheltenham
      Montgome
    
    
      12
      77370
      7.7370
      34.2
      10
      -1.2
      Clifton
      Delaware
    
    
      13
      170822
      17.0822
      33.7
      32
      2.4
      Collegeville
      Montgome
    
    
      14
      40642
      4.0642
      45.7
      15
      0.0
      Darby Bor.
      Delaware
    
    
      15
      71359
      7.1359
      22.3
      8
      1.6
      Darby Town
      Delaware
    
    
      16
      104923
      10.4923
      48.1
      21
      6.9
      Downingtown
      Chester
    
    
      17
      190317
      19.0317
      19.4
      26
      1.9
      Doylestown
      Bucks
    
    
      18
      215512
      21.5512
      71.9
      26
      5.8
      E. Bradford
      Chester
    
    
      19
      178105
      17.8105
      45.1
      25
      2.3
      E. Goshen
      Chester
    
    
      20
      131025
      13.1025
      31.3
      19
      -1.8
      E. Norriton
      Montgome
    
    
      21
      149844
      14.9844
      24.9
      22
      6.4
      E. Pikeland
      Chester
    
    
      22
      170556
      17.0556
      27.2
      30
      4.6
      E. Whiteland
      Chester
    
    
      23
      280969
      28.0969
      17.7
      14
      2.9
      Easttown
      Chester
    
    
      24
      114233
      11.4233
      29.0
      30
      1.3
      Falls Town
      Bucks
    
    
      25
      74502
      7.4502
      21.4
      15
      -3.2
      Follcroft
      Delaware
    
    
      26
      475112
      47.5112
      28.6
      12
      NaN
      Gladwyne
      Montgome
    
    
      27
      97167
      9.7167
      29.3
      10
      0.2
      Glenolden
      Delaware
    
    
      28
      114572
      11.4572
      17.5
      20
      5.2
      Hatboro
      Montgome
    
    
      29
      436348
      43.6348
      16.5
      10
      -0.7
      Haverford
      Delaware
    
    
      ...
      ...
      ...
      ...
      ...
      ...
      ...
      ...
    
    
      69
      100231
      10.0231
      24.1
      15
      1.9
      Ridley Town
      Delaware
    
    
      70
      95831
      9.5831
      21.2
      32
      3.2
      Royersford
      Montgome
    
    
      71
      229711
      22.9711
      9.8
      22
      5.3
      Schuylkill
      Chester
    
    
      72
      74308
      7.4308
      29.9
      7
      1.8
      Sharon Hill
      Delaware
    
    
      73
      259506
      25.9506
      7.2
      40
      17.4
      Solebury
      Bucks
    
    
      74
      159573
      15.9573
      19.4
      15
      -2.1
      Springfield
      Montgome
    
    
      75
      147176
      14.7176
      41.1
      12
      -1.7
      Springfield
      Delaware
    
    
      76
      205732
      20.5732
      11.2
      12
      -0.2
      Swarthmore
      Delaware
    
    
      77
      215783
      21.5783
      21.2
      20
      1.1
      Tredyffin
      Chester
    
    
      78
      116710
      11.6710
      42.8
      20
      12.9
      U. Chichester
      Delaware
    
    
      79
      359112
      35.9112
      9.4
      36
      4.0
      U. Makefield
      Bucks
    
    
      80
      189959
      18.9959
      61.7
      22
      -2.1
      U. Merion
      Montgome
    
    
      81
      133198
      13.3198
      19.4
      22
      -2.0
      U. Moreland
      Montgome
    
    
      82
      242821
      24.2821
      6.6
      21
      1.6
      U. Providence
      Delaware
    
    
      83
      142811
      14.2811
      15.9
      20
      -1.6
      U. Southampton
      Bucks
    
    
      84
      200498
      20.0498
      18.8
      36
      11.0
      U. Uwchlan
      Chester
    
    
      85
      199065
      19.9065
      13.2
      20
      7.8
      Upper Darby
      Montgome
    
    
      86
      93648
      9.3648
      34.5
      8
      -0.7
      Upper Darby
      Delaware
    
    
      87
      163001
      16.3001
      22.1
      50
      8.0
      Uwchlan T.
      Chester
    
    
      88
      436348
      43.6348
      22.1
      15
      1.3
      Villanova
      Montgome
    
    
      89
      124478
      12.4478
      71.9
      22
      4.6
      W. Chester
      Chester
    
    
      90
      168276
      16.8276
      31.9
      26
      5.9
      W. Goshen
      Chester
    
    
      91
      114157
      11.4157
      44.6
      38
      14.6
      W. Whiteland
      Chester
    
    
      92
      130088
      13.0088
      28.6
      19
      -0.2
      Warminster
      Bucks
    
    
      93
      152624
      15.2624
      24.0
      19
      23.1
      Warrington
      Bucks
    
    
      94
      174232
      17.4232
      13.8
      25
      4.7
      Westtown
      Chester
    
    
      95
      196515
      19.6515
      29.9
      16
      1.8
      Whitemarsh
      Montgome
    
    
      96
      232714
      23.2714
      9.9
      21
      0.2
      Willistown
      Chester
    
    
      97
      245920
      24.5920
      22.6
      10
      0.3
      Wynnewood
      Montgome
    
    
      98
      130953
      13.0953
      13.0
      24
      5.2
      Yardley
      Bucks
    
  

99 rows × 7 columns



In [21]:

    
# Show matplotlib plots inline in the Jupyter notebook (IPython line magic)
%matplotlib inline

Exploring the data

The house price in a town is correlated with the crime rate of that town. Low crime towns tend to be associated with higher house prices and vice versa.



In [34]:

    
# One point per town: crime rate on x, average house price on y
plt.scatter(x=sales['CrimeRate'], y=sales['HousePrice'], alpha=0.5)
plt.ylabel('House price')
plt.xlabel('Crime rate')









    Out[34]:





<matplotlib.text.Text at 0x7fae71ffa4e0>

Fit the regression model using crime as the feature



In [77]:

    
# Feature matrix (double brackets keep it a one-column DataFrame) and target Series
X = sales[['CrimeRate']]
y = sales['HousePrice']

# Check the type and shape of each
print(type(X), X.shape, sep='\n')
print(type(y), y.shape, sep='\n')









    



<class 'pandas.core.frame.DataFrame'>
(99, 1)
<class 'pandas.core.series.Series'>
(99,)



In [101]:

    
# Fit house price against crime rate on the full dataset; fit() returns the
# estimator itself, so the last line displays the fitted model.
crime_model = linear_model.LinearRegression().fit(X, y)
crime_model









    Out[101]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

Let's see what our fit looks like



In [88]:

    
plt.plot(
    sales['CrimeRate'], sales['HousePrice'], '.',  # observed towns
    X, crime_model.predict(X), '-',                # fitted regression line
    linewidth=3,
)
plt.ylabel('House price')
plt.xlabel('Crime rate')









    Out[88]:





<matplotlib.text.Text at 0x7fae67e1b908>

Remove Center City and redo the analysis

Center City is the one observation with an extremely high crime rate, yet house prices are not very low. This point does not follow the trend of the rest of the data very well. A question is how much including Center City is influencing our fit on the other datapoints. Let's remove this datapoint and see what happens.



In [90]:

    
# Drop the row that is zero miles from Center City (i.e. Center City itself)
sales_noCC = sales.loc[sales['MilesPhila'].ne(0.0)]



In [92]:

    
# Same scatter as before, with Center City removed
plt.scatter(x=sales_noCC['CrimeRate'], y=sales_noCC['HousePrice'], alpha=0.5)
plt.ylabel('House price')
plt.xlabel('Crime rate')









    Out[92]:





<matplotlib.text.Text at 0x7fae67d379e8>



In [112]:

    
# Refit on the data without Center City; fit() returns the estimator, which
# the last line displays.
crime_model_noCC = linear_model.LinearRegression().fit(
    sales_noCC[['CrimeRate']],
    sales_noCC['HousePrice'],
)
crime_model_noCC









    Out[112]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [113]:

    
plt.plot(
    sales_noCC['CrimeRate'], sales_noCC['HousePrice'], '.',  # observed towns
    sales_noCC[['CrimeRate']],
    crime_model_noCC.predict(sales_noCC[['CrimeRate']]), '-',  # fitted line
    linewidth=3,
)
plt.ylabel('House price')
plt.xlabel('Crime rate')









    Out[113]:





<matplotlib.text.Text at 0x7fae67b96898>

Compare coefficients for full-data fit versus no-Center-City fit

Visually, the fit seems different, but let's quantify this by examining the estimated coefficients of our original fit and that of the modified dataset with Center City removed.



In [118]:

    
# Coefficients of the fit on the full dataset
for _label, _value in (('slope: ', crime_model.coef_),
                       ('intercept: ', crime_model.intercept_)):
    print(_label, _value)









    



slope:  [-576.90812768]
intercept:  176629.408107



In [119]:

    
# Coefficients of the fit with Center City removed
for _label, _value in (('slope: ', crime_model_noCC.coef_),
                       ('intercept: ', crime_model_noCC.intercept_)):
    print(_label, _value)









    



slope:  [-2288.68942995]
intercept:  225233.551839

Above: We see that for the "no Center City" version, each unit increase in crime lowers the predicted house price by about 2,289. In contrast, for the original dataset, the drop is only about 577 per unit increase in crime. Removing a single observation changes the slope roughly fourfold!

High leverage points:

Center City is said to be a "high leverage" point because it is at an extreme x value where there are not other observations. As a result, recalling the closed-form solution for simple regression, this point has the potential to dramatically change the least squares line since the center of x mass is heavily influenced by this one point and the least squares line will try to fit close to that outlying (in x) point. If a high leverage point follows the trend of the other data, this might not have much effect. On the other hand, if this point somehow differs, it can be strongly influential in the resulting fit.

Influential observations:

An influential observation is one where the removal of the point significantly changes the fit. As discussed above, high leverage points are good candidates for being influential observations, but need not be. Other observations that are not leverage points can also be influential observations (e.g., strongly outlying in y even if x is a typical value).

Remove high-value outlier neighborhoods and redo analysis

Based on the discussion above, a question is whether the outlying high-value towns are strongly influencing the fit. Let's remove them and see what happens.



In [121]:

    
# Starting from the no-Center-City data, keep only towns whose house price is
# below 350000, then refit. The model name keeps its existing spelling
# ("nohighhend") because later cells refer to it by that name.
sales_nohighend = sales_noCC.loc[sales_noCC['HousePrice'].lt(350000)]
crime_model_nohighhend = linear_model.LinearRegression().fit(
    sales_nohighend[['CrimeRate']],
    sales_nohighend['HousePrice'],
)
crime_model_nohighhend









    Out[121]:





LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)



In [127]:

    
plt.plot(
    sales_nohighend['CrimeRate'], sales_nohighend['HousePrice'], '.',  # observed towns
    sales_nohighend[['CrimeRate']],
    crime_model_nohighhend.predict(sales_nohighend[['CrimeRate']]), '-',  # fitted line
    linewidth=3,
)
plt.ylabel('House price')
plt.xlabel('Crime rate')









    Out[127]:





<matplotlib.text.Text at 0x7fae6807e048>

Do the coefficients change much?



In [123]:

    
# No-Center-City coefficients, repeated here as the baseline for comparison
for _label, _value in (('slope: ', crime_model_noCC.coef_),
                       ('intercept: ', crime_model_noCC.intercept_)):
    print(_label, _value)









    



slope:  [-2288.68942995]
intercept:  225233.551839



In [124]:

    
# Coefficients after also removing the high-value towns
for _label, _value in (('slope: ', crime_model_nohighhend.coef_),
                       ('intercept: ', crime_model_nohighhend.intercept_)):
    print(_label, _value)









    



slope:  [-1838.56264859]
intercept:  199098.85267

Above: We see that removing the outlying high-value neighborhoods has some effect on the fit, but not nearly as much as our high-leverage Center City datapoint.



In [ ]:

	HousePrice	HsPrc ($10,000)	CrimeRate	MilesPhila	PopChg	Name	County
0	140463	14.0463	29.7	10	-1.0	Abington	Montgome
1	113033	11.3033	24.1	18	4.0	Ambler	Montgome
2	124186	12.4186	19.5	25	8.0	Aston	Delaware
3	110490	11.0490	49.4	25	2.7	Bensalem	Bucks
4	79124	7.9124	54.1	19	3.9	Bristol B.	Bucks
5	92634	9.2634	48.6	20	0.6	Bristol T.	Bucks
6	89246	8.9246	30.8	15	-2.6	Brookhaven	Delaware
7	195145	19.5145	10.8	20	-3.5	Bryn Athyn	Montgome
8	297342	29.7342	20.2	14	0.6	Bryn Mawr	Montgome
9	264298	26.4298	20.4	26	6.0	Buckingham	Bucks
10	134342	13.4342	17.3	31	4.2	Chalfont	Bucks
11	147600	14.7600	50.3	9	-1.0	Cheltenham	Montgome
12	77370	7.7370	34.2	10	-1.2	Clifton	Delaware
13	170822	17.0822	33.7	32	2.4	Collegeville	Montgome
14	40642	4.0642	45.7	15	0.0	Darby Bor.	Delaware
15	71359	7.1359	22.3	8	1.6	Darby Town	Delaware
16	104923	10.4923	48.1	21	6.9	Downingtown	Chester
17	190317	19.0317	19.4	26	1.9	Doylestown	Bucks
18	215512	21.5512	71.9	26	5.8	E. Bradford	Chester
19	178105	17.8105	45.1	25	2.3	E. Goshen	Chester
20	131025	13.1025	31.3	19	-1.8	E. Norriton	Montgome
21	149844	14.9844	24.9	22	6.4	E. Pikeland	Chester
22	170556	17.0556	27.2	30	4.6	E. Whiteland	Chester
23	280969	28.0969	17.7	14	2.9	Easttown	Chester
24	114233	11.4233	29.0	30	1.3	Falls Town	Bucks
25	74502	7.4502	21.4	15	-3.2	Follcroft	Delaware
26	475112	47.5112	28.6	12	NaN	Gladwyne	Montgome
27	97167	9.7167	29.3	10	0.2	Glenolden	Delaware
28	114572	11.4572	17.5	20	5.2	Hatboro	Montgome
29	436348	43.6348	16.5	10	-0.7	Haverford	Delaware
...	...	...	...	...	...	...	...
69	100231	10.0231	24.1	15	1.9	Ridley Town	Delaware
70	95831	9.5831	21.2	32	3.2	Royersford	Montgome
71	229711	22.9711	9.8	22	5.3	Schuylkill	Chester
72	74308	7.4308	29.9	7	1.8	Sharon Hill	Delaware
73	259506	25.9506	7.2	40	17.4	Solebury	Bucks
74	159573	15.9573	19.4	15	-2.1	Springfield	Montgome
75	147176	14.7176	41.1	12	-1.7	Springfield	Delaware
76	205732	20.5732	11.2	12	-0.2	Swarthmore	Delaware
77	215783	21.5783	21.2	20	1.1	Tredyffin	Chester
78	116710	11.6710	42.8	20	12.9	U. Chichester	Delaware
79	359112	35.9112	9.4	36	4.0	U. Makefield	Bucks
80	189959	18.9959	61.7	22	-2.1	U. Merion	Montgome
81	133198	13.3198	19.4	22	-2.0	U. Moreland	Montgome
82	242821	24.2821	6.6	21	1.6	U. Providence	Delaware
83	142811	14.2811	15.9	20	-1.6	U. Southampton	Bucks
84	200498	20.0498	18.8	36	11.0	U. Uwchlan	Chester
85	199065	19.9065	13.2	20	7.8	Upper Darby	Montgome
86	93648	9.3648	34.5	8	-0.7	Upper Darby	Delaware
87	163001	16.3001	22.1	50	8.0	Uwchlan T.	Chester
88	436348	43.6348	22.1	15	1.3	Villanova	Montgome
89	124478	12.4478	71.9	22	4.6	W. Chester	Chester
90	168276	16.8276	31.9	26	5.9	W. Goshen	Chester
91	114157	11.4157	44.6	38	14.6	W. Whiteland	Chester
92	130088	13.0088	28.6	19	-0.2	Warminster	Bucks
93	152624	15.2624	24.0	19	23.1	Warrington	Bucks
94	174232	17.4232	13.8	25	4.7	Westtown	Chester
95	196515	19.6515	29.9	16	1.8	Whitemarsh	Montgome
96	232714	23.2714	9.9	21	0.2	Willistown	Chester
97	245920	24.5920	22.6	10	0.3	Wynnewood	Montgome
98	130953	13.0953	13.0	24	5.2	Yardley	Bucks