In [1]:
import graphlab
In [2]:
# Load the Philadelphia crime-rate data set from CSV into an SFrame.
# The path is relative, so the file must sit in the notebook's working directory.
crime_rate_data = graphlab.SFrame.read_csv('Philadelphia_Crime_Rate_noNA.csv')
In [3]:
# Show the loaded SFrame as the cell output for a quick look at its columns.
crime_rate_data
Out[3]:
In [4]:
# Send GraphLab Canvas output to the 'ipynb' target so that the .show()
# calls in later cells render inside this notebook.
graphlab.canvas.set_target('ipynb')
In [5]:
# Scatter plot of house price against crime rate for every observation.
crime_rate_data.show(
    view='Scatter Plot',
    x='CrimeRate',
    y='HousePrice',
)
In [6]:
# Simple linear regression of HousePrice on CrimeRate using all observations.
# validation_set=None: fit on the full data rather than holding rows out.
crime_model = graphlab.linear_regression.create(
    crime_rate_data,
    target='HousePrice',
    features=['CrimeRate'],
    validation_set=None,
    verbose=False,
)
In [8]:
import matplotlib.pyplot as plt
In [9]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
In [10]:
# Overlay the fitted regression line ('-') on the raw data points ('.').
crime_rate = crime_rate_data['CrimeRate']
fitted_price = crime_model.predict(crime_rate_data)
plt.plot(crime_rate, crime_rate_data['HousePrice'], '.',
         crime_rate, fitted_price, '-')
Out[10]:
We can see that there is an outlier in the data: one point has a very high crime rate yet also a high house price, so it does not follow the overall trend. This point is Center City, the center of the city.
Center City is a single observation with an extremely high crime rate and high house prices. It is an outlier in some sense, so we can remove it and refit the model.
In [11]:
# Drop the Center City observation, identified here as the row(s) whose
# MilesPhila value is exactly 0.0.
is_not_center_city = crime_rate_data['MilesPhila'] != 0.0
crime_rate_data_noCC = crime_rate_data[is_not_center_city]
In [12]:
# Same scatter plot as before, now without the Center City observation.
crime_rate_data_noCC.show(
    view='Scatter Plot',
    x='CrimeRate',
    y='HousePrice',
)
Notice the difference between the previous scatter plot and this one, after removing the outlier (Center City).
In [13]:
# Refit the same HousePrice ~ CrimeRate regression on the data with the
# Center City observation removed.
crime_model_withNoCC = graphlab.linear_regression.create(
    crime_rate_data_noCC,
    target='HousePrice',
    features=['CrimeRate'],
    validation_set=None,
    verbose=False,
)
In [14]:
# Overlay the refitted regression line ('-') on the data points ('.')
# that remain after removing Center City.
crime_rate_noCC = crime_rate_data_noCC['CrimeRate']
fitted_price_noCC = crime_model_withNoCC.predict(crime_rate_data_noCC)
plt.plot(crime_rate_noCC, crime_rate_data_noCC['HousePrice'], '.',
         crime_rate_noCC, fitted_price_noCC, '-')
Out[14]:
In [15]:
# Coefficients of the model fitted on the full data set (Center City included).
crime_model.get('coefficients')
Out[15]:
In [16]:
# Coefficients of the model fitted without Center City, for comparison with
# the full-data model.
crime_model_withNoCC.get('coefficients')
Out[16]:
In [17]:
# Starting from the data without Center City, keep only the rows whose
# HousePrice is below 350000, i.e. drop the high-value neighborhoods.
is_below_high_end = crime_rate_data_noCC['HousePrice'] < 350000
crime_rate_data_noHighEnd = crime_rate_data_noCC[is_below_high_end]
In [18]:
# Refit the HousePrice ~ CrimeRate regression on the data with both Center
# City and the high-priced observations removed.
crime_model_noHighEnd = graphlab.linear_regression.create(
    crime_rate_data_noHighEnd,
    target='HousePrice',
    features=['CrimeRate'],
    validation_set=None,
    verbose=False,
)
In [19]:
# Show the no-Center-City coefficients again so they sit directly next to the
# no-high-end coefficients in the following cell.
crime_model_withNoCC.get('coefficients')
Out[19]:
In [20]:
# Coefficients of the model fitted after also removing the high-priced
# observations (HousePrice >= 350000).
crime_model_noHighEnd.get('coefficients')
Out[20]:
We see that removing the outlying high-value neighborhoods has some effect on the fit, but not nearly as much as removing the high-leverage Center City data point. Hence, high-leverage points are much stronger candidates for being influential observations, whereas outliers are not necessarily influential.
In [ ]: